518 lines
22 KiB
Python
518 lines
22 KiB
Python
"""Gymnasium environment for the shepherd-dog herding task.
|
|
|
|
Single-agent: the agent is the dog. Sheep are environment-controlled
|
|
flocking agents whose dynamics are imported verbatim from
|
|
``herding.flocking_sim`` so a policy trained here transfers to Webots
|
|
without re-tuning. Differential-drive kinematics for both dog and sheep
|
|
match the proto specs (wheel radius, base, max wheel ω) via
|
|
``herding.diffdrive``.
|
|
|
|
Action space
|
|
------------
|
|
Box(-1, 1, (2,)) — the dog's desired (vx, vy) velocity *intent*. This
|
|
matches the high-level action representation the Webots controller
|
|
already uses; the env converts (vx, vy) → wheel speeds with the same
|
|
formula.
|
|
|
|
Observation space
|
|
-----------------
|
|
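Box(-inf, inf, (OBS_DIM * frame_stack,)) — ``frame_stack`` consecutive
frames (oldest first), each the order-invariant feature vector built by
``herding.obs.build_obs`` (shape (28,) with the default ``frame_stack=1``).
See ``herding/obs.py`` for the layout.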
|
|
|
|
Reset
|
|
-----
|
|
``options["n_sheep"]`` (1..MAX_SHEEP) overrides the flock size for the
next episode. If absent, the ``n_sheep`` passed to the constructor is
used; if that is also ``None``, the flock size is sampled uniformly from
[1, max_n_sheep] on each reset, where ``max_n_sheep`` can be raised over
training time by an outer callback.
|
|
|
|
Reward
|
|
------
|
|
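Sparse + shaping. The weights are the ``W_*`` class attributes of
:class:`HerdingEnv`; see :func:`HerdingEnv._compute_reward`.

    +100.0 per newly penned sheep (``W_PEN_DELTA``)
    +20.0 · decrease in the mean distance of active sheep to the pen
            entry, clamped to ±5 per step (``W_PROGRESS``)
    +0.5 · cosine similarity between the action and the Strömbom
            teacher's action (``W_IMITATE``)
    +500.0 terminal bonus when all sheep are penned (``W_DONE``)

The time, wall and collision weights (``W_TIME``, ``W_WALL``,
``W_COLLISION``) are currently 0.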
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
import os
|
|
import random
|
|
import sys
|
|
from typing import Optional
|
|
|
|
import gymnasium as gym
|
|
import numpy as np
|
|
from gymnasium import spaces
|
|
|
|
# Make herding/ importable when run from anywhere: the parent of this
# file's directory is treated as the project root and is put at the front
# of sys.path (once) before the ``herding.*`` imports that follow.
_HERE = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, ".."))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
|
|
|
|
from herding.diffdrive import (
|
|
heading_speed_to_wheels, kinematics_step, velocity_to_wheels,
|
|
)
|
|
from herding.flocking_sim import (
|
|
FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed,
|
|
)
|
|
from herding.geometry import (
|
|
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE,
|
|
DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP,
|
|
PEN_ENTRY, PEN_X, PEN_Y,
|
|
SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS,
|
|
WEBOTS_DT, is_penned_position,
|
|
)
|
|
from herding.lidar_perception import detections_from_scan
|
|
from herding.lidar_sim import simulate_scan
|
|
from herding.obs import OBS_DIM, build_obs
|
|
from herding.sheep_tracker import SheepTracker
|
|
from herding.strombom import compute_action as strombom_action
|
|
|
|
|
|
class HerdingEnv(gym.Env):
    """Single-agent shepherd-dog herding env.

    Each step is one Webots ``basicTimeStep`` (16 ms). Episodes terminate
    when all sheep are penned, or are truncated after ``max_steps`` steps.
    """

    metadata = {"render_modes": []}

    # Reward weights.
    #
    # History: the first run got stuck at 0% success, and two later runs
    # got stuck farming a position bonus instead of penning sheep. The
    # reward was therefore drastically simplified to essentially:
    #   • a huge jackpot for actually penning sheep (+100 per pen, +500 done)
    #   • a small dense gradient: per-sheep mean distance to pen
    # No position shaping (gameable), no compactness shaping (gameable),
    # no engagement bonus (gameable). Per-step shaping is bounded by the
    # clamp inside _compute_reward.
    # NOTE(review): an earlier revision of this comment mentioned a terminal
    # per-unpenned penalty; ``step`` documents that no such penalty is
    # applied any more.
    #
    # We have a working analytic baseline (Strömbom, 100 % on easy mode).
    # It is used as a teacher: a per-step bonus proportional to the cosine
    # similarity between the policy's action and what Strömbom would do
    # at the same state. This drags the policy out of "do nothing" local
    # optima without locking it to the teacher — PPO can still find
    # improvements over Strömbom because pen jackpots dominate.
    W_PEN_DELTA = 100.0
    W_PROGRESS = 20.0
    W_IMITATE = 0.5      # per-step max ±0.5 (action cosine sim, [-1, 1])
    # The next three weights are zero and are not referenced by the reward
    # code visible in this class.
    W_TIME = 0.0
    W_WALL = 0.0
    W_COLLISION = 0.0
    W_DONE = 500.0

    # Action smoothing during training: 0 = none. The Webots controller
    # still applies its own EMA at inference for actuator stability, so
    # the policy doesn't need to learn smoothness explicitly.
    ACTION_SMOOTH = 0.0

    # Episode budget. ~80 s of sim time at dt=0.016. The new external-pen
    # layout has paths up to ~28 m from spawn to pen entry; at sheep flee
    # speed ~0.4 m/s, that's 70 s minimum. 3000 steps (48 s) was leaving
    # the dog with no margin for collect-then-drive on multi-sheep cases.
    DEFAULT_MAX_STEPS = 5000

    # Distance under which the dog is considered "colliding" with a sheep.
    # NOTE(review): not referenced by the methods visible here (W_COLLISION
    # is 0) — confirm whether it is still needed.
    COLLISION_DIST = 0.30
|
|
|
|
def __init__(
    self,
    n_sheep: Optional[int] = None,
    max_n_sheep: int = MAX_SHEEP,
    max_steps: int = DEFAULT_MAX_STEPS,
    difficulty: float = 0.0,
    seed: Optional[int] = None,
    use_lidar: bool = True,
    frame_stack: int = 1,
):
    """Configure the environment; episode state is populated by ``reset``.

    Args:
        n_sheep: Fixed flock size for every episode. ``None`` lets
            ``reset`` sample it uniformly from ``[1, max_n_sheep]``.
        max_n_sheep: Upper bound for the sampled flock size. Clamped to
            ``[1, MAX_SHEEP]``, the same rule ``set_max_n_sheep`` applies.
        max_steps: Step count at which ``step`` reports truncation.
        difficulty: Spawn-region knob. 0 = sheep spawn next to the gate
            (easy), 1 = sheep spawn anywhere in the field (hard, the
            deployment distribution). Clamped to ``[0, 1]``, the same
            rule ``set_difficulty`` applies.
        seed: Stored on ``self._initial_seed``; this constructor does not
            seed any RNG itself.
        use_lidar: When True (default), the obs and the imitation-reward
            teacher see only LiDAR-perceived sheep positions through a
            tracker — matching what the Webots controller has access to.
            When False, both consume ground-truth positions (legacy
            "privileged" mode, kept for ablation).
        frame_stack: Number K of single-frame observations concatenated
            into one observation (values below 1 are treated as 1). Lets
            a memoryless MLP integrate information across time, partly
            compensating for the limited LiDAR FOV. K=1 reproduces the
            legacy single-frame obs.
    """
    super().__init__()

    # Perception mode.
    self._use_lidar = bool(use_lidar)
    self._tracker = SheepTracker() if self._use_lidar else None
    self._np_rng_lidar: Optional[np.random.Generator] = None

    # Frame stacking.
    self._frame_stack = max(1, int(frame_stack))
    self._frame_buffer: list[np.ndarray] = []

    self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
    self._single_obs_dim = OBS_DIM
    self.observation_space = spaces.Box(
        low=-np.inf, high=np.inf,
        shape=(OBS_DIM * self._frame_stack,), dtype=np.float32,
    )

    # If n_sheep is None, the env samples uniformly from [1, max_n_sheep]
    # on every reset — this is the default for curriculum-free training.
    self._fixed_n_sheep = n_sheep
    # Clamp here exactly as the public setters do, so an out-of-range
    # constructor argument cannot put the env in a state the setters
    # would never produce.
    self._max_n_sheep = int(np.clip(max_n_sheep, 1, MAX_SHEEP))
    self.max_steps = max_steps
    self._difficulty = float(np.clip(difficulty, 0.0, 1.0))
    self._initial_seed = seed

    # State (initialized in reset).
    self.dog_x = self.dog_y = self.dog_heading = 0.0
    self.sheep_x = np.zeros(0, dtype=np.float32)
    self.sheep_y = np.zeros(0, dtype=np.float32)
    self.sheep_h = np.zeros(0, dtype=np.float32)
    self.sheep_penned = np.zeros(0, dtype=bool)
    self.sheep_wander = np.zeros(0, dtype=np.float32)

    self.prev_action = np.zeros(2, dtype=np.float32)
    self.smoothed_action = np.zeros(2, dtype=np.float32)
    self.steps = 0
    self.n_sheep = 0
    self.prev_n_penned = 0
    self.prev_d_pen = 0.0
    self.prev_radius = 0.0

    # Env-owned RNG for the flocking wander-jitter, seeded fresh on each
    # reset so determinism is preserved without touching the global
    # random module.
    self._py_rng = random.Random()
|
|
|
|
# ---- public knobs (used by curriculum callback) ----
|
|
def set_max_n_sheep(self, value: int) -> None:
    """Set the upper bound for sampled flock sizes, clamped to [1, MAX_SHEEP]."""
    bounded = np.clip(value, 1, MAX_SHEEP)
    self._max_n_sheep = int(bounded)
|
|
|
|
def set_difficulty(self, value: float) -> None:
|
|
self._difficulty = float(np.clip(value, 0.0, 1.0))
|
|
|
|
def set_imitate_weight(self, value: float) -> None:
|
|
"""Override W_IMITATE (instance-level) — used to disable the
|
|
Strömbom imitation reward during BC fine-tuning, when the policy
|
|
already mimics a stronger teacher (sequential)."""
|
|
self.W_IMITATE = float(value)
|
|
|
|
# ---- gym API ----
|
|
def reset(self, *, seed=None, options=None):
    """Start a new episode and return ``(obs, info)``.

    Args:
        seed: Seed forwarded to ``gym.Env.reset``. When ``None`` on the
            first reset, the ``seed`` given to the constructor is used
            instead, so ``HerdingEnv(seed=...)`` is reproducible without
            the caller having to repeat the seed here.
        options: Optional dict. ``options["n_sheep"]`` (when present and
            not ``None``) overrides the flock size for this episode.

    Returns:
        The first observation and ``{"n_sheep": <flock size>}``.
    """
    # The constructor seed is consumed at most once: re-applying it on a
    # later un-seeded reset would replay the same episode forever, and an
    # explicit first-reset seed must take precedence over it.
    if seed is None:
        seed = self._initial_seed
    self._initial_seed = None
    super().reset(seed=seed)

    # Re-seed the flocking RNG from np_random so flocking jitter is
    # reproducible alongside everything else the env samples.
    self._py_rng.seed(int(self.np_random.integers(0, 2**31 - 1)))
    opts = options or {}

    # Flock size: per-episode override > constructor value > random draw.
    requested = opts.get("n_sheep")
    if requested is not None:
        self.n_sheep = int(requested)
    elif self._fixed_n_sheep is not None:
        self.n_sheep = int(self._fixed_n_sheep)
    else:
        self.n_sheep = int(self.np_random.integers(1, self._max_n_sheep + 1))

    # Dog spawns near origin with random heading.
    self.dog_x = float(self.np_random.uniform(-2.5, 2.5))
    self.dog_y = float(self.np_random.uniform(-2.5, 2.5))
    self.dog_heading = float(self.np_random.uniform(-math.pi, math.pi))

    # Sheep spawn region scales with difficulty:
    #   0.0 → narrow box just north of the gate (x ∈ [7, 14], y ∈ [-12, -6])
    #   1.0 → full field                        (x ∈ [-13, 13], y ∈ [-12, 13])
    # Linear interpolation between the two for intermediate values.
    d = self._difficulty
    sx_lo = 7.0 - d * 20.0     # → -13 at d=1
    sx_hi = 14.0 - d * 1.0     # →  13 at d=1
    sy_lo = -12.0 + d * 0.0    # → -12 at d=1
    sy_hi = -6.0 + d * 19.0    # →  13 at d=1

    spawn_x, spawn_y, headings, wanders = [], [], [], []
    for _ in range(self.n_sheep):
        # Rejection sampling, capped at 100 attempts. If every attempt is
        # rejected the last sample is used anyway, so reset never hangs.
        for _try in range(100):
            sx = float(self.np_random.uniform(sx_lo, sx_hi))
            sy = float(self.np_random.uniform(sy_lo, sy_hi))
            # Reject too close to dog or to other sheep.
            if math.hypot(sx - self.dog_x, sy - self.dog_y) < 3.0:
                continue
            if any(math.hypot(sx - x, sy - y) < 1.5
                   for x, y in zip(spawn_x, spawn_y)):
                continue
            # Reject inside the gate column already (they'd start "penned").
            if PEN_X[0] <= sx <= PEN_X[1] and sy < -8.0:
                continue
            break
        spawn_x.append(sx)
        spawn_y.append(sy)
        headings.append(float(self.np_random.uniform(-math.pi, math.pi)))
        wanders.append(float(self.np_random.uniform(-math.pi, math.pi)))

    self.sheep_x = np.asarray(spawn_x, dtype=np.float32)
    self.sheep_y = np.asarray(spawn_y, dtype=np.float32)
    self.sheep_h = np.asarray(headings, dtype=np.float32)
    self.sheep_wander = np.asarray(wanders, dtype=np.float32)
    self.sheep_penned = np.zeros(self.n_sheep, dtype=bool)

    self.prev_action = np.zeros(2, dtype=np.float32)
    self.smoothed_action = np.zeros(2, dtype=np.float32)
    self.steps = 0
    self.prev_n_penned = 0
    self.prev_d_pen, self.prev_radius = self._flock_metrics()

    if self._tracker is not None:
        self._tracker.reset()
        self._np_rng_lidar = np.random.default_rng(
            int(self.np_random.integers(0, 2**31 - 1)))
        # Prime the tracker with one scan so the first obs isn't empty.
        self._update_tracker()

    # Clear the frame stack — the next _build_obs will repopulate.
    self._frame_buffer = []

    obs = self._build_obs()
    info = {"n_sheep": self.n_sheep}
    return obs, info
|
|
|
|
def step(self, action):
    """Advance the simulation by one ``WEBOTS_DT`` tick.

    The action is clipped to [-1, 1], EMA-smoothed with ``ACTION_SMOOTH``,
    converted to wheel speeds and applied to the dog; then every sheep is
    stepped, penned flags are updated, the LiDAR tracker (if any) is
    refreshed and the reward is computed.

    Returns:
        ``(obs, reward, terminated, truncated, info)`` where ``terminated``
        is True once every sheep is penned and ``truncated`` is True once
        ``max_steps`` steps have elapsed.
    """
    action = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0)

    # EMA smoothing — the Webots controller does this too.
    alpha = self.ACTION_SMOOTH
    self.smoothed_action = alpha * self.prev_action + (1.0 - alpha) * action
    self.prev_action = self.smoothed_action.copy()
    vx = float(self.smoothed_action[0])
    vy = float(self.smoothed_action[1])

    # Safety supervisor mirrored from the controller — keeps the dog
    # north of the gate so the policy can't strand itself in the pen.
    # NOTE(review): dog_y is clamped to >= DOG_SOUTH_LIMIT further down, so
    # this strict `<` can only fire if the dog starts south of the limit —
    # confirm whether `<=` was intended.
    if vy < 0.0 and self.dog_y < DOG_SOUTH_LIMIT:
        vx, vy = 0.0, 1.0

    # --- Step the dog ---
    left_w, right_w = velocity_to_wheels(
        vx, vy, self.dog_heading,
        max_linear=DOG_MAX_LINEAR,
        wheel_radius=DOG_WHEEL_RADIUS,
        max_wheel_omega=DOG_MAX_WHEEL_OMEGA,
        k_turn=4.0,
    )
    new_x, new_y, new_heading = kinematics_step(
        self.dog_x, self.dog_y, self.dog_heading,
        left_w, right_w, DOG_WHEEL_RADIUS, DOG_WHEEL_BASE, WEBOTS_DT,
    )
    # Clip dog to field bounds and out of pen — same as the Webots stone walls.
    self.dog_x = float(np.clip(new_x, FIELD_X[0] + 0.3, FIELD_X[1] - 0.3))
    self.dog_y = float(np.clip(new_y, DOG_SOUTH_LIMIT, FIELD_Y[1] - 0.3))
    self.dog_heading = new_heading

    # --- Step each sheep ---
    for idx in range(self.n_sheep):
        self._step_one_sheep(idx)

    # --- Update penned state (penned is sticky: never cleared here) ---
    for idx in range(self.n_sheep):
        if self.sheep_penned[idx]:
            continue
        if is_penned_position(self.sheep_x[idx], self.sheep_y[idx]):
            self.sheep_penned[idx] = True

    # --- Run LiDAR perception on this step's state (after sheep have
    #     moved). Updates the tracker that obs and the imitation-
    #     reward teacher consume. Reward / termination still use GT. ---
    if self._tracker is not None:
        self._update_tracker()

    # --- Reward, termination ---
    # The reward must be computed before the prev_* baselines are advanced.
    d_pen, radius = self._flock_metrics()
    reward = self._compute_reward(d_pen, radius, action=action)
    n_penned_now = int(self.sheep_penned.sum())
    self.prev_d_pen = d_pen
    self.prev_radius = radius
    self.prev_n_penned = n_penned_now

    self.steps += 1
    all_penned = bool(self.sheep_penned.all())
    terminated = all_penned
    truncated = self.steps >= self.max_steps
    if all_penned:
        reward += self.W_DONE
    # No timeout penalty: a per-unpenned penalty made "do nothing"
    # strictly preferable to noisy-random under reward-progress shaping
    # (random sometimes pushes sheep away → negative progress, then
    # always ate the timeout penalty), which collapsed exploration to
    # tiny actions. The pen jackpot alone provides the directional
    # signal once exploration is wide enough to find it.

    obs = self._build_obs()
    info = {
        "n_sheep": self.n_sheep,
        "n_penned": n_penned_now,
        "is_success": all_penned,
        "steps": self.steps,
    }
    return obs, float(reward), terminated, truncated, info
|
|
|
|
# ---- internals ----
|
|
def _step_one_sheep(self, i: int) -> None:
    """Advance sheep ``i`` by one ``WEBOTS_DT`` tick, in place.

    Asks ``compute_heading_speed`` for the desired heading/speed given the
    dog and the other sheep, converts that to wheel speeds, integrates the
    kinematics and clamps the result to the field (or to the pen's south
    edge when the sheep is inside the gate column).
    """
    px = float(self.sheep_x[i])
    py = float(self.sheep_y[i])
    current_heading = float(self.sheep_h[i])

    # Every other sheep, penned or not, is passed as a peer.
    others = []
    for j in range(self.n_sheep):
        if j == i:
            continue
        others.append((float(self.sheep_x[j]), float(self.sheep_y[j])))

    want_heading, want_speed, wander = compute_heading_speed(
        px, py,
        penned=bool(self.sheep_penned[i]),
        dog_xy=(self.dog_x, self.dog_y),
        peers=others,
        wander_angle=float(self.sheep_wander[i]),
        rng=self._py_rng,
    )
    self.sheep_wander[i] = wander

    left_w, right_w = heading_speed_to_wheels(
        want_heading, want_speed, current_heading,
        max_wheel_omega=SHEEP_MAX_WHEEL_OMEGA, k_turn=4.0,
    )
    nx, ny, nh = kinematics_step(
        px, py, current_heading, left_w, right_w,
        SHEEP_WHEEL_RADIUS, SHEEP_WHEEL_BASE, WEBOTS_DT,
    )

    # Wall clipping — matches Webots stone walls, except in the gate column
    # where the south wall is absent and the pen's south edge applies.
    margin = 0.2
    nx = float(np.clip(nx, FIELD_X[0] + margin, FIELD_X[1] - margin))
    south_edge = PEN_Y[0] if PEN_X[0] <= nx <= PEN_X[1] else FIELD_Y[0]
    ny = float(np.clip(ny, south_edge + margin, FIELD_Y[1] - margin))

    self.sheep_x[i] = nx
    self.sheep_y[i] = ny
    self.sheep_h[i] = nh
|
|
|
|
def _flock_metrics(self):
|
|
"""(per-sheep mean distance to pen entry, max-radius).
|
|
|
|
Using the per-sheep mean instead of CoM-distance ensures stragglers
|
|
keep contributing to the progress signal — the dog can't game the
|
|
shaping by herding the bulk of the flock and abandoning one
|
|
outlier (CoM moves toward pen, but mean-distance doesn't).
|
|
"""
|
|
active_mask = ~self.sheep_penned
|
|
if not active_mask.any():
|
|
return 0.0, 0.0
|
|
xs = self.sheep_x[active_mask]
|
|
ys = self.sheep_y[active_mask]
|
|
per_sheep_d = np.hypot(xs - PEN_ENTRY[0], ys - PEN_ENTRY[1])
|
|
d_pen = float(per_sheep_d.mean())
|
|
com_x, com_y = float(xs.mean()), float(ys.mean())
|
|
if active_mask.sum() == 1:
|
|
radius = 0.0
|
|
else:
|
|
radius = float(np.hypot(xs - com_x, ys - com_y).max())
|
|
return d_pen, radius
|
|
|
|
def _compute_reward(self, d_pen: float, radius: float, action=None) -> float:
|
|
"""Sparse + per-sheep distance shaping + Strömbom imitation.
|
|
|
|
d_pen is the *mean* distance over active sheep, so progress only
|
|
accrues when ALL active sheep get closer to the pen on average —
|
|
the dog can't farm it by herding one sheep while ignoring others.
|
|
|
|
The imitation term is computed by querying Strömbom for the
|
|
recommended action at the *current* (post-step) state and
|
|
rewarding cosine similarity with what the policy actually did.
|
|
"""
|
|
n_penned = int(self.sheep_penned.sum())
|
|
delta_pen = n_penned - self.prev_n_penned
|
|
|
|
d_progress = max(-5.0, min(5.0, self.prev_d_pen - d_pen))
|
|
r = self.W_PEN_DELTA * delta_pen + self.W_PROGRESS * d_progress
|
|
|
|
if action is not None and self.W_IMITATE > 0.0:
|
|
positions = self._perceived_positions()
|
|
if positions:
|
|
sx, sy, _mode = strombom_action(
|
|
(self.dog_x, self.dog_y), positions, PEN_ENTRY,
|
|
)
|
|
a_norm = math.hypot(float(action[0]), float(action[1]))
|
|
s_norm = math.hypot(sx, sy)
|
|
if a_norm > 1e-3 and s_norm > 1e-3:
|
|
cos_sim = (float(action[0]) * sx + float(action[1]) * sy) / (a_norm * s_norm)
|
|
r += self.W_IMITATE * cos_sim
|
|
|
|
return float(r)
|
|
|
|
def _build_single_obs(self) -> np.ndarray:
    """Build one (un-stacked) observation frame via ``build_obs``.

    In LiDAR mode only the tracker's active set is observed and every
    entry is flagged as not penned; penned tracks are intentionally
    excluded (matches the prior receiver-based behaviour where penned
    sheep stopped contributing to the symbolic obs). In privileged mode
    the ground-truth positions and penned flags of all sheep are used.
    """
    if self._tracker is None:
        coords = list(zip(self.sheep_x.tolist(), self.sheep_y.tolist()))
        penned_flags = self.sheep_penned.tolist()
    else:
        tracked = self._tracker.get_positions()
        coords = list(tracked.values())
        penned_flags = [False] * len(coords)

    dog_xy = (self.dog_x, self.dog_y)
    return build_obs(
        dog_xy, self.dog_heading,
        coords, penned_flags,
        n_max=self._max_n_sheep,
    )
|
|
|
|
def _build_obs(self) -> np.ndarray:
    """Return the policy observation: the last ``frame_stack`` frames.

    With ``frame_stack <= 1`` the single frame is returned as-is.
    Otherwise the newest frame is pushed into the rolling buffer and the
    buffer is concatenated oldest → newest as float32.
    """
    frame = self._build_single_obs()
    depth = self._frame_stack
    if depth <= 1:
        return frame

    if self._frame_buffer:
        self._frame_buffer.append(frame)
        if len(self._frame_buffer) > depth:
            # Keep only the most recent ``depth`` frames.
            self._frame_buffer = self._frame_buffer[-depth:]
    else:
        # On a fresh reset the buffer is empty — duplicate the first
        # frame so the stack is always full-length.
        self._frame_buffer = [frame.copy() for _ in range(depth)]

    stacked = np.concatenate(self._frame_buffer, axis=0)
    return stacked.astype(np.float32)
|
|
|
|
# ------------------------------------------------------------------
|
|
# LiDAR perception helpers
|
|
# ------------------------------------------------------------------
|
|
def _all_sheep_xy(self) -> list[tuple[float, float]]:
|
|
"""Every sheep, including penned ones (the LiDAR sees them)."""
|
|
return [(float(self.sheep_x[i]), float(self.sheep_y[i]))
|
|
for i in range(self.n_sheep)]
|
|
|
|
def _update_tracker(self) -> None:
    """Simulate one LiDAR scan from the dog's pose and feed it to the tracker.

    Callers in this class only invoke this when ``self._tracker`` is set.
    """
    dog_pose = (self.dog_x, self.dog_y, self.dog_heading)
    scan = simulate_scan(
        *dog_pose,
        self._all_sheep_xy(),
        rng=self._np_rng_lidar,
    )
    self._tracker.update(detections_from_scan(scan, *dog_pose))
|
|
|
|
def perceived_positions(self) -> dict[str, tuple[float, float]]:
|
|
"""Public accessor — what the controller would 'see' this step.
|
|
|
|
LiDAR mode → the tracker's active set.
|
|
Privileged mode → ground-truth active sheep.
|
|
|
|
Used by ``training.eval`` and ``tools.collect_demos`` so analytic
|
|
teachers run on the same perception the deployed controller has.
|
|
"""
|
|
if self._tracker is not None:
|
|
return self._tracker.get_positions()
|
|
return {f"s{i}": (float(self.sheep_x[i]), float(self.sheep_y[i]))
|
|
for i in range(self.n_sheep) if not self.sheep_penned[i]}
|
|
|
|
# Internal alias so the imitation reward path doesn't need to know
|
|
# which mode it's in.
|
|
_perceived_positions = perceived_positions
|