"""Gymnasium environment for the shepherd-dog herding task. Single-agent: the agent is the dog. Sheep are environment-controlled flocking agents whose dynamics are imported verbatim from ``herding.flocking_sim`` so a policy trained here transfers to Webots without re-tuning. Differential-drive kinematics for both dog and sheep match the proto specs (wheel radius, base, max wheel ω) via ``herding.diffdrive``. Action space ------------ Box(-1, 1, (2,)) — the dog's desired (vx, vy) velocity *intent*. This matches the high-level action representation the Webots controller already uses; the env converts (vx, vy) → wheel speeds with the same formula. Observation space ----------------- Box(-inf, inf, (28,)) — the order-invariant feature vector built by ``herding.obs.build_obs``. See ``herding/obs.py`` for the layout. Reset ----- ``options["n_sheep"]`` (1..MAX_SHEEP) overrides the default flock size for the next episode. If absent, flock size is sampled uniformly from [1, max_n_sheep] each reset, where ``max_n_sheep`` can be raised over training time by an outer callback. Reward ------ Sparse + shaping (see :func:`HerdingEnv._compute_reward` for weights). +2.0 per newly penned sheep +0.5 · ΔCoM-distance-to-pen (positive when CoM moves closer) +0.2 · ΔFlock-radius (positive when flock tightens) -0.005 per step (encourages speed) - wall and collision penalties +10.0 terminal bonus when all sheep penned """ from __future__ import annotations import math import os import random import sys from typing import Optional import gymnasium as gym import numpy as np from gymnasium import spaces # Make herding/ importable when run from anywhere. _HERE = os.path.dirname(os.path.abspath(__file__)) _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..")) if _PROJECT_ROOT not in sys.path: sys.path.insert(0, _PROJECT_ROOT) from herding.world.diffdrive import ( heading_speed_to_wheels, kinematics_step, velocity_to_wheels, ) from herding.world.flocking_sim import ( FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed, ) from herding.world.geometry import ( DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE, DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP, PEN_ENTRY, PEN_X, PEN_Y, SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS, WEBOTS_DT, is_penned_position, ) from herding.perception.lidar_perception import detections_from_scan from herding.perception.lidar_sim import simulate_scan from herding.obs import OBS_DIM, build_obs from herding.perception.sheep_tracker import SheepTracker from herding.control.strombom import compute_action as strombom_action class HerdingEnv(gym.Env): """Single-agent shepherd-dog herding env. Each step is one Webots ``basicTimeStep`` (16 ms). Episodes terminate when all sheep are penned, or after ``max_steps`` steps (truncation). """ metadata = {"render_modes": []} # Reward shaping weights. Re-tuned after the first run got stuck at # 0% success: progress reward must dominate the time penalty by a # large margin, and the pen-event bonus must be big enough that PPO's # advantage estimator can credit-assign across the long path that # leads to it. Per-step shaping is bounded by the clamps inside # _compute_reward. # Drastically simplified after two runs got stuck farming a position # bonus instead of penning sheep. Reward now is essentially: # • huge jackpot for actually penning sheep (+100 per pen, +500 done) # • small dense gradient: per-sheep mean distance to pen # No position shaping (gameable), no compactness shaping (gameable), # no engagement bonus (gameable). The terminal per-unpenned penalty # forbids "good enough" partial herds. # We have a working analytic baseline (Strömbom, 100 % on easy mode). # Use it as a teacher: per-step bonus proportional to the cosine # similarity between the policy's action and what Strömbom would do # at the same state. This drags the policy out of "do nothing" local # optima without locking it to the teacher — PPO can still find # improvements over Strömbom because pen jackpots dominate. W_PEN_DELTA = 100.0 W_PROGRESS = 20.0 W_IMITATE = 0.5 # per-step max ±0.5 (action cosine sim, [-1, 1]) W_TIME = 0.0 W_WALL = 0.0 W_COLLISION = 0.0 W_DONE = 500.0 # Action smoothing during training: 0 = none. The Webots controller # still applies its own EMA at inference for actuator stability, so # the policy doesn't need to learn smoothness explicitly. ACTION_SMOOTH = 0.0 # Episode budget. ~80 s of sim time at dt=0.016. The new external-pen # layout has paths up to ~28 m from spawn to pen entry; at sheep flee # speed ~0.4 m/s, that's 70 s minimum. 3000 steps (48 s) was leaving # the dog with no margin for collect-then-drive on multi-sheep cases. DEFAULT_MAX_STEPS = 5000 # Distance under which the dog is considered "colliding" with a sheep. COLLISION_DIST = 0.30 def __init__( self, n_sheep: Optional[int] = None, max_n_sheep: int = MAX_SHEEP, max_steps: int = DEFAULT_MAX_STEPS, difficulty: float = 0.0, seed: Optional[int] = None, use_lidar: bool = True, frame_stack: int = 1, ): super().__init__() # When True (default), the obs and the imitation-reward teacher # see only LiDAR-perceived sheep positions through a tracker — # matching what the Webots controller has access to. When False, # both consume ground-truth positions (legacy "privileged" mode, # kept for ablation). self._use_lidar = bool(use_lidar) self._tracker = SheepTracker() if self._use_lidar else None self._np_rng_lidar: Optional[np.random.Generator] = None # Frame stacking: the policy receives the last K single-frame # observations concatenated. Lets a memoryless MLP integrate # information across time, partly compensating for the limited # LiDAR FOV. K=1 reproduces the legacy single-frame obs. self._frame_stack = max(1, int(frame_stack)) self._frame_buffer: list[np.ndarray] = [] self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32) self._single_obs_dim = OBS_DIM self.observation_space = spaces.Box( low=-np.inf, high=np.inf, shape=(OBS_DIM * self._frame_stack,), dtype=np.float32, ) # If n_sheep is None, env will sample uniformly from [1, max_n_sheep] # on every reset — this is the default for curriculum-free training. self._fixed_n_sheep = n_sheep self._max_n_sheep = max_n_sheep self.max_steps = max_steps # difficulty ∈ [0, 1]: 0 = sheep spawn next to the gate (easy), # 1 = sheep spawn anywhere in the field (hard, the deployment # distribution). Curriculum bumps this from 0 → 1 over training. self._difficulty = float(difficulty) self._initial_seed = seed # State (initialized in reset) self.dog_x = self.dog_y = self.dog_heading = 0.0 self.sheep_x = np.zeros(0, dtype=np.float32) self.sheep_y = np.zeros(0, dtype=np.float32) self.sheep_h = np.zeros(0, dtype=np.float32) self.sheep_penned = np.zeros(0, dtype=bool) self.sheep_wander = np.zeros(0, dtype=np.float32) self.prev_action = np.zeros(2, dtype=np.float32) self.smoothed_action = np.zeros(2, dtype=np.float32) self.steps = 0 self.n_sheep = 0 self.prev_n_penned = 0 self.prev_d_pen = 0.0 self.prev_radius = 0.0 # Env-owned RNG for the flocking wander-jitter, seeded fresh on each # reset so determinism is preserved without touching the global # random module. self._py_rng = random.Random() # ---- public knobs (used by curriculum callback) ---- def set_max_n_sheep(self, value: int) -> None: self._max_n_sheep = int(np.clip(value, 1, MAX_SHEEP)) def set_difficulty(self, value: float) -> None: self._difficulty = float(np.clip(value, 0.0, 1.0)) def set_imitate_weight(self, value: float) -> None: """Override W_IMITATE (instance-level) — used to disable the Strömbom imitation reward during BC fine-tuning, when the policy already mimics a stronger teacher (sequential).""" self.W_IMITATE = float(value) def set_time_weight(self, value: float) -> None: """Override W_TIME (instance-level). Default 0.0; a small negative value (e.g. -0.1) adds a per-step penalty that explicitly rewards fast time-to-pen during PPO fine-tune.""" self.W_TIME = float(value) # ---- gym API ---- def reset(self, *, seed=None, options=None): super().reset(seed=seed) # Re-seed the flocking RNG from np_random so flocking jitter is # reproducible alongside everything else the env samples. self._py_rng.seed(int(self.np_random.integers(0, 2**31 - 1))) opts = options or {} if "n_sheep" in opts and opts["n_sheep"] is not None: self.n_sheep = int(opts["n_sheep"]) elif self._fixed_n_sheep is not None: self.n_sheep = int(self._fixed_n_sheep) else: self.n_sheep = int(self.np_random.integers(1, self._max_n_sheep + 1)) # Dog spawns near origin with random heading. self.dog_x = float(self.np_random.uniform(-2.5, 2.5)) self.dog_y = float(self.np_random.uniform(-2.5, 2.5)) self.dog_heading = float(self.np_random.uniform(-math.pi, math.pi)) # Sheep spawn region scales with difficulty: # 0.0 → narrow box just north of the gate (x ∈ [7, 14], y ∈ [-12, -6]) # 1.0 → full field (x ∈ [-13, 13], y ∈ [-12, 13]) # Linear interpolation between the two for intermediate values. d = self._difficulty sx_lo = 7.0 - d * 20.0 # → -13 at d=1 sx_hi = 14.0 - d * 1.0 # → 13 at d=1 sy_lo = -12.0 + d * 0.0 # → -12 at d=1 sy_hi = -6.0 + d * 19.0 # → 13 at d=1 sxs, sys_, shs, sws = [], [], [], [] for _ in range(self.n_sheep): for _try in range(100): sx = float(self.np_random.uniform(sx_lo, sx_hi)) sy = float(self.np_random.uniform(sy_lo, sy_hi)) # Reject too close to dog or to other sheep. if math.hypot(sx - self.dog_x, sy - self.dog_y) < 3.0: continue if any(math.hypot(sx - x, sy - y) < 1.5 for x, y in zip(sxs, sys_)): continue # Reject inside the gate column already (they'd start "penned"). if PEN_X[0] <= sx <= PEN_X[1] and sy < -8.0: continue break sxs.append(sx); sys_.append(sy) shs.append(float(self.np_random.uniform(-math.pi, math.pi))) sws.append(float(self.np_random.uniform(-math.pi, math.pi))) self.sheep_x = np.asarray(sxs, dtype=np.float32) self.sheep_y = np.asarray(sys_, dtype=np.float32) self.sheep_h = np.asarray(shs, dtype=np.float32) self.sheep_wander = np.asarray(sws, dtype=np.float32) self.sheep_penned = np.zeros(self.n_sheep, dtype=bool) self.prev_action = np.zeros(2, dtype=np.float32) self.smoothed_action = np.zeros(2, dtype=np.float32) self.steps = 0 self.prev_n_penned = 0 self.prev_d_pen, self.prev_radius = self._flock_metrics() if self._tracker is not None: self._tracker.reset() self._np_rng_lidar = np.random.default_rng( int(self.np_random.integers(0, 2**31 - 1))) # Prime the tracker with one scan so the first obs isn't empty. self._update_tracker() # Clear the frame stack — the next _build_obs will repopulate. self._frame_buffer = [] obs = self._build_obs() info = {"n_sheep": self.n_sheep} return obs, info def step(self, action): action = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0) # EMA smoothing — the Webots controller does this too. self.smoothed_action = ( self.ACTION_SMOOTH * self.prev_action + (1.0 - self.ACTION_SMOOTH) * action ) self.prev_action = self.smoothed_action.copy() vx, vy = float(self.smoothed_action[0]), float(self.smoothed_action[1]) # Safety supervisor mirrored from the controller — keeps the dog # north of the gate so the policy can't strand itself in the pen. if self.dog_y < DOG_SOUTH_LIMIT and vy < 0.0: vx, vy = 0.0, 1.0 # --- Step the dog --- wL, wR = velocity_to_wheels( vx, vy, self.dog_heading, max_linear=DOG_MAX_LINEAR, wheel_radius=DOG_WHEEL_RADIUS, max_wheel_omega=DOG_MAX_WHEEL_OMEGA, k_turn=4.0, ) self.dog_x, self.dog_y, self.dog_heading = kinematics_step( self.dog_x, self.dog_y, self.dog_heading, wL, wR, DOG_WHEEL_RADIUS, DOG_WHEEL_BASE, WEBOTS_DT, ) # Clip dog to field bounds and out of pen — same as the Webots stone walls. self.dog_x = float(np.clip(self.dog_x, FIELD_X[0] + 0.3, FIELD_X[1] - 0.3)) self.dog_y = float(np.clip(self.dog_y, DOG_SOUTH_LIMIT, FIELD_Y[1] - 0.3)) # --- Step each sheep --- for i in range(self.n_sheep): self._step_one_sheep(i) # --- Update penned state --- for i in range(self.n_sheep): if (not self.sheep_penned[i] and is_penned_position(self.sheep_x[i], self.sheep_y[i])): self.sheep_penned[i] = True # --- Run LiDAR perception on this step's state (after sheep have # moved). Updates the tracker that obs and the imitation- # reward teacher consume. Reward / termination still use GT. --- if self._tracker is not None: self._update_tracker() # --- Reward, termination --- d_pen, radius = self._flock_metrics() reward = self._compute_reward(d_pen, radius, action=action) self.prev_d_pen = d_pen self.prev_radius = radius self.prev_n_penned = int(self.sheep_penned.sum()) self.steps += 1 all_penned = bool(self.sheep_penned.all()) terminated = all_penned truncated = self.steps >= self.max_steps if all_penned: reward += self.W_DONE # No timeout penalty: a per-unpenned penalty made "do nothing" # strictly preferable to noisy-random under reward-progress shaping # (random sometimes pushes sheep away → negative progress, then # always ate the timeout penalty), which collapsed exploration to # tiny actions. The pen jackpot alone provides the directional # signal once exploration is wide enough to find it. obs = self._build_obs() info = { "n_sheep": self.n_sheep, "n_penned": self.prev_n_penned, "is_success": all_penned, "steps": self.steps, } return obs, float(reward), terminated, truncated, info # ---- internals ---- def _step_one_sheep(self, i: int) -> None: x, y = float(self.sheep_x[i]), float(self.sheep_y[i]) peers = [(float(self.sheep_x[j]), float(self.sheep_y[j])) for j in range(self.n_sheep) if j != i] heading, speed_motor, new_wander = compute_heading_speed( x, y, penned=bool(self.sheep_penned[i]), dog_xy=(self.dog_x, self.dog_y), peers=peers, wander_angle=float(self.sheep_wander[i]), rng=self._py_rng, ) self.sheep_wander[i] = new_wander wL, wR = heading_speed_to_wheels( heading, speed_motor, float(self.sheep_h[i]), max_wheel_omega=SHEEP_MAX_WHEEL_OMEGA, k_turn=4.0, ) nx, ny, nh = kinematics_step( x, y, float(self.sheep_h[i]), wL, wR, SHEEP_WHEEL_RADIUS, SHEEP_WHEEL_BASE, WEBOTS_DT, ) # Wall clipping — matches Webots stone walls, except in the gate column # where the south wall is absent. nx = float(np.clip(nx, FIELD_X[0] + 0.2, FIELD_X[1] - 0.2)) in_gate_col = PEN_X[0] <= nx <= PEN_X[1] if in_gate_col: ny = float(np.clip(ny, PEN_Y[0] + 0.2, FIELD_Y[1] - 0.2)) else: ny = float(np.clip(ny, FIELD_Y[0] + 0.2, FIELD_Y[1] - 0.2)) self.sheep_x[i] = nx self.sheep_y[i] = ny self.sheep_h[i] = nh def _flock_metrics(self): """(per-sheep mean distance to pen entry, max-radius). Using the per-sheep mean instead of CoM-distance ensures stragglers keep contributing to the progress signal — the dog can't game the shaping by herding the bulk of the flock and abandoning one outlier (CoM moves toward pen, but mean-distance doesn't). """ active_mask = ~self.sheep_penned if not active_mask.any(): return 0.0, 0.0 xs = self.sheep_x[active_mask] ys = self.sheep_y[active_mask] per_sheep_d = np.hypot(xs - PEN_ENTRY[0], ys - PEN_ENTRY[1]) d_pen = float(per_sheep_d.mean()) com_x, com_y = float(xs.mean()), float(ys.mean()) if active_mask.sum() == 1: radius = 0.0 else: radius = float(np.hypot(xs - com_x, ys - com_y).max()) return d_pen, radius def _compute_reward(self, d_pen: float, radius: float, action=None) -> float: """Sparse + per-sheep distance shaping + Strömbom imitation. d_pen is the *mean* distance over active sheep, so progress only accrues when ALL active sheep get closer to the pen on average — the dog can't farm it by herding one sheep while ignoring others. The imitation term is computed by querying Strömbom for the recommended action at the *current* (post-step) state and rewarding cosine similarity with what the policy actually did. """ n_penned = int(self.sheep_penned.sum()) delta_pen = n_penned - self.prev_n_penned d_progress = max(-5.0, min(5.0, self.prev_d_pen - d_pen)) r = self.W_PEN_DELTA * delta_pen + self.W_PROGRESS * d_progress # Per-step time penalty (0 by default). When negative, encourages # the policy to finish quickly — used during PPO fine-tune. r += self.W_TIME if action is not None and self.W_IMITATE > 0.0: positions = self._perceived_positions() if positions: sx, sy, _mode = strombom_action( (self.dog_x, self.dog_y), positions, PEN_ENTRY, ) a_norm = math.hypot(float(action[0]), float(action[1])) s_norm = math.hypot(sx, sy) if a_norm > 1e-3 and s_norm > 1e-3: cos_sim = (float(action[0]) * sx + float(action[1]) * sy) / (a_norm * s_norm) r += self.W_IMITATE * cos_sim return float(r) def _build_single_obs(self) -> np.ndarray: if self._tracker is not None: # Obs sees only the tracker's active set; penned tracks are # intentionally excluded (matches the prior receiver-based # behaviour where penned sheep stopped contributing to the # symbolic obs). active = self._tracker.get_positions() sheep_xy_list = list(active.values()) sheep_penned_list = [False] * len(sheep_xy_list) else: sheep_xy_list = list(zip(self.sheep_x.tolist(), self.sheep_y.tolist())) sheep_penned_list = self.sheep_penned.tolist() return build_obs( (self.dog_x, self.dog_y), self.dog_heading, sheep_xy_list, sheep_penned_list, n_max=self._max_n_sheep, ) def _build_obs(self) -> np.ndarray: single = self._build_single_obs() if self._frame_stack <= 1: return single # On a fresh reset the buffer is empty — duplicate the first # frame so the stack is always full-length. if not self._frame_buffer: self._frame_buffer = [single.copy() for _ in range(self._frame_stack)] else: self._frame_buffer.append(single) if len(self._frame_buffer) > self._frame_stack: self._frame_buffer = self._frame_buffer[-self._frame_stack:] # Concatenate oldest → newest. return np.concatenate(self._frame_buffer, axis=0).astype(np.float32) # ------------------------------------------------------------------ # LiDAR perception helpers # ------------------------------------------------------------------ def _all_sheep_xy(self) -> list[tuple[float, float]]: """Every sheep, including penned ones (the LiDAR sees them).""" return [(float(self.sheep_x[i]), float(self.sheep_y[i])) for i in range(self.n_sheep)] def _update_tracker(self) -> None: ranges = simulate_scan( self.dog_x, self.dog_y, self.dog_heading, self._all_sheep_xy(), rng=self._np_rng_lidar, ) detections = detections_from_scan( ranges, self.dog_x, self.dog_y, self.dog_heading, ) self._tracker.update(detections) def perceived_positions(self) -> dict[str, tuple[float, float]]: """Public accessor — what the controller would 'see' this step. LiDAR mode → the tracker's active set. Privileged mode → ground-truth active sheep. Used by ``training.eval`` and ``tools.collect_demos`` so analytic teachers run on the same perception the deployed controller has. """ if self._tracker is not None: return self._tracker.get_positions() return {f"s{i}": (float(self.sheep_x[i]), float(self.sheep_y[i])) for i in range(self.n_sheep) if not self.sheep_penned[i]} # Internal alias so the imitation reward path doesn't need to know # which mode it's in. _perceived_positions = perceived_positions