Checkpoint 8
This commit is contained in:
+6
-20
@@ -1,4 +1,9 @@
|
||||
# Training pipeline
|
||||
# Training and Evaluation Details
|
||||
|
||||
This file is the command-level companion to the root README. It focuses
on data collection, BC, PPO fine-tuning, evaluation flags, and generated
artifacts; use the root README for the high-level architecture and
Webots demo quick start.
|
||||
|
||||
Two stages, strictly sequential:
|
||||
|
||||
@@ -26,16 +31,6 @@ runs/ — checkpoints (whitelisted entries in top-level .gitignore)
|
||||
run with ``python -m pytest tests/``.)
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
CPU is the default and recommended device — SB3 PPO with an MLP policy
|
||||
of this size runs faster on CPU than GPU because the bottleneck is
|
||||
rollout collection, not gradient compute.
|
||||
|
||||
## End-to-end pipeline
|
||||
|
||||
The simplest way to run everything is the Makefile at the project
|
||||
@@ -93,12 +88,3 @@ python -m training.eval --policy strombom --max-flock 10 --max-steps 15000 --
|
||||
python -m training.eval --policy sequential --max-flock 10 --max-steps 15000 --n-seeds 10
|
||||
```
|
||||
|
||||
## Webots inference
|
||||
|
||||
```
|
||||
tools/run_webots.sh 10 bc # or rl, strombom, sequential
|
||||
```
|
||||
|
||||
The dog controller loads `runs/bc` for `bc` mode and `runs/rl` for
|
||||
`rl` mode. Override with `HERDING_POLICY_DIR=…` for a specific
|
||||
checkpoint.
|
||||
|
||||
+82
-15
@@ -15,51 +15,102 @@ Usage::
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Early CLI pre-pass: read only ``--world`` so geometry can be configured
# before the heavy imports below. (argparse is used again in ``main`` for
# the full parse; this is a lightweight scan of the raw argument list.)
import sys  # explicit import; ``os.sys`` is an undocumented detail of ``os``

_pre_argv = sys.argv[1:]  # slicing already yields a fresh list
_pre_world = None
for i, a in enumerate(_pre_argv):
    # No ``break``: argparse keeps the LAST occurrence of a repeated
    # option, so the pre-pass must do the same for both parses to agree.
    if a == "--world" and i + 1 < len(_pre_argv):
        _pre_world = _pre_argv[i + 1]
    elif a.startswith("--world="):
        _pre_world = a.split("=", 1)[1]
if _pre_world is not None:
    from herding.world.geometry import configure as _geo_configure
    _geo_configure(_pre_world)
    # Mirror the choice into HERDING_WORLD, the env var that the
    # ``--world`` help text names as the fallback source.
    os.environ["HERDING_WORLD"] = _pre_world
|
||||
|
||||
from herding.control.active_scan import ActiveScanTeacher
|
||||
from herding.world.geometry import PEN_ENTRY
|
||||
from herding.world.geometry import PEN_ENTRY, FIELD_SHAPE
|
||||
from herding.control.sequential import compute_action as sequential_action
|
||||
from herding.control.strombom import compute_action as strombom_action
|
||||
from herding.control.universal import compute_action as universal_action
|
||||
from training.herding_env import HerdingEnv
|
||||
|
||||
|
||||
TEACHERS = {
|
||||
"sequential": sequential_action,
|
||||
"strombom": strombom_action,
|
||||
"universal": universal_action,
|
||||
}
|
||||
|
||||
|
||||
def _call_teacher(fn, dog_xy, dog_heading, sheep_positions, pen_target,
                  drive_mode="differential"):
    """Call any teacher function and return ``(vx, vy, omega, mode)``.

    Normalizes across 3-tuple teachers ``(vx, vy, mode)`` and the 4-tuple
    universal teacher ``(vx, vy, omega, mode)``. ActiveScanTeacher (when
    invoked with ``drive_mode="mecanum"``) propagates the base teacher's
    omega — see test_active_scan_preserves_mecanum_omega.

    Args:
        fn: Teacher callable. Three call shapes are tried, in order:
            ``(dog_xy, heading, sheep, pen, drive_mode)``,
            ``(dog_xy, heading, sheep, pen)`` and ``(dog_xy, sheep, pen)``.
        dog_xy: Dog position, forwarded unchanged.
        dog_heading: Dog heading, forwarded to the extended call shapes.
        sheep_positions: Sheep positions, forwarded unchanged.
        pen_target: Pen target, forwarded unchanged.
        drive_mode: Forwarded only to the five-argument call shape.

    Returns:
        A 4-tuple ``(vx, vy, omega, mode)``; ``omega`` is ``0.0`` when the
        teacher returned only three fields.

    Raises:
        TypeError: If the teacher rejects all three call shapes.
        ValueError: If the teacher returns neither 3 nor 4 fields.
    """
    # The universal teacher and ActiveScanTeacher accept the extended
    # signature; older teachers accept (dog_xy, sheep, pen). Detect by
    # trying the extended call first.
    # NOTE(review): a TypeError raised *inside* a teacher is
    # indistinguishable here from a signature mismatch and triggers a
    # retry with fewer arguments — confirm teachers never raise TypeError
    # for other reasons (and that a retried stateful teacher is harmless).
    try:
        result = fn(dog_xy, dog_heading, sheep_positions, pen_target,
                    drive_mode)
    except TypeError:
        try:
            result = fn(dog_xy, dog_heading, sheep_positions, pen_target)
        except TypeError:
            result = fn(dog_xy, sheep_positions, pen_target)

    n_fields = len(result)
    if n_fields == 4:
        vx, vy, omega, mode = result
    elif n_fields == 3:
        vx, vy, mode = result
        omega = 0.0  # 3-field teachers carry no rotation command
    else:
        raise ValueError(
            f"teacher {fn!r} returned {n_fields} values; expected "
            "(vx, vy, mode) or (vx, vy, omega, mode)"
        )
    # Always hand back a real tuple, whatever sequence type the teacher used.
    return vx, vy, omega, mode
|
||||
|
||||
|
||||
def collect_one(n_sheep: int, seed: int, max_steps: int, subsample: int,
|
||||
teacher_fn, frame_stack: int = 1, privileged: bool = False):
|
||||
teacher_fn, frame_stack: int = 1, privileged: bool = False,
|
||||
drive_mode: str = "differential"):
|
||||
env = HerdingEnv(n_sheep=n_sheep, max_steps=max_steps,
|
||||
difficulty=1.0, seed=seed, frame_stack=frame_stack)
|
||||
difficulty=1.0, seed=seed, frame_stack=frame_stack,
|
||||
drive_mode=drive_mode)
|
||||
obs, _ = env.reset(seed=seed)
|
||||
obs_list, action_list = [], []
|
||||
# Wrap the base teacher so it opens with a rotation and walks to
|
||||
# centre when the tracker briefly empties — matches the student.
|
||||
scan_teacher = ActiveScanTeacher(teacher_fn)
|
||||
for step in range(max_steps):
|
||||
if privileged:
|
||||
# Asymmetric variant: teacher reads ground truth while the
|
||||
# student keeps the LiDAR obs. Default off.
|
||||
positions = {f"s{i}": (float(env.sheep_x[i]), float(env.sheep_y[i]))
|
||||
for i in range(env.n_sheep) if not env.sheep_penned[i]}
|
||||
if not positions:
|
||||
break
|
||||
vx, vy, _mode = teacher_fn(
|
||||
(env.dog_x, env.dog_y), positions, PEN_ENTRY,
|
||||
vx, vy, omega, _mode = _call_teacher(
|
||||
teacher_fn, (env.dog_x, env.dog_y), env.dog_heading,
|
||||
positions, PEN_ENTRY, drive_mode,
|
||||
)
|
||||
else:
|
||||
positions = env.perceived_positions()
|
||||
vx, vy, _mode = scan_teacher(
|
||||
(env.dog_x, env.dog_y), env.dog_heading,
|
||||
positions, PEN_ENTRY,
|
||||
result = _call_teacher(
|
||||
scan_teacher, (env.dog_x, env.dog_y), env.dog_heading,
|
||||
positions, PEN_ENTRY, drive_mode,
|
||||
)
|
||||
action = np.array([vx, vy], dtype=np.float32)
|
||||
vx, vy, omega, _mode = result
|
||||
if drive_mode == "mecanum":
|
||||
action = np.array([vx, vy, omega], dtype=np.float32)
|
||||
else:
|
||||
action = np.array([vx, vy], dtype=np.float32)
|
||||
if step % subsample == 0:
|
||||
obs_list.append(obs.copy())
|
||||
action_list.append(action.copy())
|
||||
@@ -85,7 +136,7 @@ def main():
|
||||
help="Keep every Nth (obs, action) pair.")
|
||||
parser.add_argument("--keep-failures", action="store_true",
|
||||
help="Include partial-success trajectories. Default off.")
|
||||
parser.add_argument("--teacher", default="sequential",
|
||||
parser.add_argument("--teacher", default="universal",
|
||||
choices=list(TEACHERS.keys()),
|
||||
help="Which analytic teacher to demonstrate.")
|
||||
parser.add_argument("--frame-stack", type=int, default=1,
|
||||
@@ -94,9 +145,24 @@ def main():
|
||||
parser.add_argument("--privileged", action="store_true",
|
||||
help="Teacher reads ground truth instead of "
|
||||
"tracker output (asymmetric BC).")
|
||||
parser.add_argument("--drive-mode", default="differential",
|
||||
choices=["differential", "mecanum"],
|
||||
help="Drive mode for the dog robot.")
|
||||
parser.add_argument("--world", default=None,
|
||||
choices=["field", "field_round"],
|
||||
help="World shape. If not set, uses HERDING_WORLD "
|
||||
"env var or defaults to 'field'. Must be set "
|
||||
"before geometry is imported.")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate --world matches geometry (already configured by the
|
||||
# early pre-parse above, or by HERDING_WORLD env var).
|
||||
if args.world is not None and args.world != FIELD_SHAPE:
|
||||
print(f"[demos] WARNING: --world={args.world} but geometry is "
|
||||
f"'{FIELD_SHAPE}'. This should not happen — file a bug.")
|
||||
|
||||
teacher_fn = TEACHERS[args.teacher]
|
||||
print(f"[demos] teacher: {args.teacher}")
|
||||
print(f"[demos] teacher: {args.teacher} world: {FIELD_SHAPE}")
|
||||
|
||||
n_sheep_list = [int(x) for x in args.n_sheep_list.split(",")]
|
||||
print(f"[demos] grid: n_sheep={n_sheep_list}, seeds={args.seeds_per_n}, "
|
||||
@@ -111,6 +177,7 @@ def main():
|
||||
obs, actions, success, total_steps = collect_one(
|
||||
n, seed, args.max_steps, args.subsample, teacher_fn,
|
||||
frame_stack=args.frame_stack, privileged=args.privileged,
|
||||
drive_mode=args.drive_mode,
|
||||
)
|
||||
n_total += 1
|
||||
if success:
|
||||
|
||||
Binary file not shown.
+19
-3
@@ -35,14 +35,15 @@ from training.herding_env import HerdingEnv
|
||||
|
||||
|
||||
def build_model(net_arch_pi, net_arch_vf, log_std_init: float,
|
||||
frame_stack: int = 1):
|
||||
frame_stack: int = 1, drive_mode: str = "differential"):
|
||||
"""Build a fresh SB3 PPO solely as a vehicle for the policy weights.
|
||||
|
||||
PPO's training-loop plumbing isn't used during BC. ``frame_stack``
|
||||
must match the demo file so the env's obs space agrees with the
|
||||
recorded obs shape.
|
||||
"""
|
||||
env = DummyVecEnv([lambda: HerdingEnv(frame_stack=frame_stack)])
|
||||
env = DummyVecEnv([lambda: HerdingEnv(frame_stack=frame_stack,
|
||||
drive_mode=drive_mode)])
|
||||
model = PPO(
|
||||
"MlpPolicy", env,
|
||||
policy_kwargs=dict(
|
||||
@@ -83,6 +84,10 @@ def main():
|
||||
"term; balances against MSE.")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--device", default="cpu")
|
||||
parser.add_argument("--drive-mode", default=None,
|
||||
choices=["differential", "mecanum"],
|
||||
help="Drive mode. If not set, inferred from "
|
||||
"demo action dimension (2→differential, 3→mecanum).")
|
||||
args = parser.parse_args()
|
||||
|
||||
torch.manual_seed(args.seed)
|
||||
@@ -130,8 +135,19 @@ def main():
|
||||
frame_stack = obs_dim // _SINGLE
|
||||
if frame_stack > 1:
|
||||
print(f"[bc] inferred frame_stack={frame_stack} from demo obs dim {obs_dim}")
|
||||
|
||||
# Infer drive mode from action dimension if not explicitly set.
|
||||
action_dim = actions.shape[1]
|
||||
if args.drive_mode is not None:
|
||||
drive_mode = args.drive_mode
|
||||
elif action_dim == 3:
|
||||
drive_mode = "mecanum"
|
||||
else:
|
||||
drive_mode = "differential"
|
||||
print(f"[bc] drive_mode={drive_mode} (action_dim={action_dim})")
|
||||
|
||||
model, _env = build_model(net_arch_pi, net_arch_vf, args.log_std_init,
|
||||
frame_stack=frame_stack)
|
||||
frame_stack=frame_stack, drive_mode=drive_mode)
|
||||
policy = model.policy.to(args.device)
|
||||
optimizer = optim.Adam(policy.parameters(), lr=args.lr)
|
||||
|
||||
|
||||
+45
-6
@@ -12,11 +12,28 @@ Usage::
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
from statistics import mean
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Early CLI pre-parse for --world so geometry is configured before
# other herding.* modules are imported.
import sys  # explicit import; ``os.sys`` is an undocumented detail of ``os``

_pre_argv = sys.argv[1:]  # slicing already yields a fresh list
_pre_world = None
for i, a in enumerate(_pre_argv):
    # No ``break``: argparse keeps the LAST occurrence of a repeated
    # option, so the pre-parse must do the same for both parses to agree.
    if a == "--world" and i + 1 < len(_pre_argv):
        _pre_world = _pre_argv[i + 1]
    elif a.startswith("--world="):
        _pre_world = a.split("=", 1)[1]
if _pre_world is not None:
    from herding.world.geometry import configure as _geo_configure
    _geo_configure(_pre_world)
    # Mirror the choice into HERDING_WORLD, the env var that the
    # ``--world`` help text names as the fallback source.
    os.environ["HERDING_WORLD"] = _pre_world
|
||||
|
||||
from herding.world.geometry import MAX_SHEEP, PEN_ENTRY
|
||||
from herding.control.sequential import compute_action as sequential_action
|
||||
from herding.control.strombom import compute_action as strombom_action
|
||||
@@ -38,18 +55,20 @@ def rollout(env: HerdingEnv, predict_fn, max_steps: int) -> dict:
|
||||
"n_penned": int(env.sheep_penned.sum())}
|
||||
|
||||
|
||||
def make_analytic_predictor(action_fn, drive_mode: str = "differential"):
    """Wrap an analytic teacher so it runs on the env's exposed
    perception (tracker in LiDAR mode, GT in privileged mode).

    Args:
        action_fn: Teacher called as ``action_fn(dog_xy, positions,
            PEN_ENTRY)`` and expected to return ``(vx, vy, mode)``.
        drive_mode: ``"mecanum"`` pads the action with a zero third
            component; any other value yields a 2-D action.

    Returns:
        A ``predict(env, obs)`` callable producing a float32 action array.
    """
    def _predict(env, _obs):
        # The observation is ignored: the teacher works from the positions
        # the env exposes via ``perceived_positions()``.
        positions = env.perceived_positions()
        vx, vy, _mode = action_fn((env.dog_x, env.dog_y), positions, PEN_ENTRY)
        if drive_mode == "mecanum":
            # These teachers give no rotation command; fill the third slot with 0.
            return np.array([vx, vy, 0.0], dtype=np.float32)
        return np.array([vx, vy], dtype=np.float32)
    return _predict
|
||||
|
||||
|
||||
def make_strombom_predictor(drive_mode: str = "differential"):
    """Return an analytic predictor built on ``strombom_action``.

    ``drive_mode`` is passed straight through to
    ``make_analytic_predictor``.
    """
    return make_analytic_predictor(strombom_action, drive_mode)
|
||||
|
||||
|
||||
def make_policy_predictor(model, vecnorm):
|
||||
@@ -73,13 +92,21 @@ def main():
|
||||
parser.add_argument("--difficulty", type=float, default=1.0,
|
||||
help="0 = sheep spawn near the gate (easy); "
|
||||
"1 = full field (deployment distribution).")
|
||||
parser.add_argument("--drive-mode", default="differential",
|
||||
choices=["differential", "mecanum"],
|
||||
help="Drive mode for the dog robot.")
|
||||
parser.add_argument("--world", default=None,
|
||||
choices=["field", "field_round"],
|
||||
help="World shape. If not set, uses HERDING_WORLD "
|
||||
"env var or defaults to 'field'.")
|
||||
args = parser.parse_args()
|
||||
|
||||
drive_mode = args.drive_mode
|
||||
frame_stack = 1
|
||||
if args.policy == "strombom":
|
||||
predict = make_analytic_predictor(strombom_action)
|
||||
predict = make_analytic_predictor(strombom_action, drive_mode)
|
||||
elif args.policy == "sequential":
|
||||
predict = make_analytic_predictor(sequential_action)
|
||||
predict = make_analytic_predictor(sequential_action, drive_mode)
|
||||
else:
|
||||
from stable_baselines3 import PPO
|
||||
run = Path(args.policy)
|
||||
@@ -114,6 +141,18 @@ def main():
|
||||
vecnorm.norm_reward = False
|
||||
predict = make_policy_predictor(model, vecnorm)
|
||||
|
||||
# Infer drive_mode from policy action dim if using a learned policy.
|
||||
if args.policy not in ("strombom", "sequential"):
|
||||
policy_action_dim = int(model.action_space.shape[0])
|
||||
if policy_action_dim == 2 and drive_mode == "mecanum":
|
||||
drive_mode = "differential"
|
||||
print(f"[eval] policy has 2D actions — overriding drive_mode "
|
||||
f"to differential")
|
||||
elif policy_action_dim == 3 and drive_mode == "differential":
|
||||
drive_mode = "mecanum"
|
||||
print(f"[eval] policy has 3D actions — overriding drive_mode "
|
||||
f"to mecanum")
|
||||
|
||||
print(f"{'n_sheep':>8} {'success%':>10} {'mean_steps':>12} {'mean_penned':>12}")
|
||||
print("-" * 46)
|
||||
for n in range(1, args.max_flock + 1):
|
||||
@@ -121,7 +160,7 @@ def main():
|
||||
for seed in range(args.n_seeds):
|
||||
env = HerdingEnv(n_sheep=n, max_steps=args.max_steps,
|
||||
difficulty=args.difficulty, seed=seed,
|
||||
frame_stack=frame_stack)
|
||||
frame_stack=frame_stack, drive_mode=drive_mode)
|
||||
r = rollout(env, predict, args.max_steps)
|
||||
successes.append(int(r["success"]))
|
||||
steps.append(r["steps"])
|
||||
|
||||
+105
-41
@@ -1,11 +1,12 @@
|
||||
"""Gymnasium environment for the shepherd-dog herding task.
|
||||
|
||||
Single-agent: the dog is the policy; sheep are env-controlled flocking
|
||||
agents (``herding.world.flocking_sim``). Differential-drive kinematics
|
||||
match the proto specs (``herding.world.diffdrive``) so a policy trained
|
||||
here transfers to Webots without re-tuning.
|
||||
agents (``herding.world.flocking_sim``). Kinematics match the proto specs
|
||||
(``herding.world.diffdrive``) so a policy trained here transfers to Webots
|
||||
without re-tuning.
|
||||
|
||||
* **Action**: ``Box(-1, 1, (2,))`` — desired ``(vx, vy)`` intent.
|
||||
* **Action** (differential): ``Box(-1, 1, (2,))`` — ``(vx, vy)`` intent.
|
||||
* **Action** (mecanum): ``Box(-1, 1, (3,))`` — ``(vx, vy, omega)`` intent.
|
||||
* **Observation**: ``Box(-inf, inf, (32·K,))`` from ``herding.perception.obs.build_obs``
|
||||
with optional frame stacking K (concatenated oldest → newest).
|
||||
* **Reset**: ``options["n_sheep"]`` overrides flock size; otherwise
|
||||
@@ -26,17 +27,20 @@ import numpy as np
|
||||
from gymnasium import spaces
|
||||
|
||||
from herding.world.diffdrive import (
|
||||
heading_speed_to_wheels, kinematics_step, velocity_to_wheels,
|
||||
heading_speed_to_wheels, kinematics_step,
|
||||
mecanum_kinematics_step, velocity_to_mecanum_wheels, velocity_to_wheels,
|
||||
)
|
||||
from herding.world.flocking_sim import (
|
||||
FLEE_SPEED, MAX_SPEED, WANDER_SPEED, compute_heading_speed,
|
||||
)
|
||||
from herding.world.geometry import (
|
||||
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_BASE,
|
||||
DOG_WHEEL_RADIUS, FIELD_X, FIELD_Y, GATE_X, MAX_SHEEP,
|
||||
DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA,
|
||||
DOG_SOUTH_LIMIT, DOG_WHEEL_BASE, DOG_WHEEL_BASE_X, DOG_WHEEL_BASE_Y,
|
||||
DOG_WHEEL_RADIUS, FIELD_SHAPE, FIELD_ROUND_R, FIELD_X, FIELD_Y,
|
||||
GATE_X, GATE_Y, MAX_SHEEP,
|
||||
PEN_ENTRY, PEN_X, PEN_Y,
|
||||
SHEEP_MAX_WHEEL_OMEGA, SHEEP_WHEEL_BASE, SHEEP_WHEEL_RADIUS,
|
||||
WEBOTS_DT, is_penned_position,
|
||||
WEBOTS_DT, clip_to_field, is_penned_position,
|
||||
)
|
||||
from herding.perception.lidar_perception import detections_from_scan
|
||||
from herding.perception.lidar_sim import simulate_scan
|
||||
@@ -82,6 +86,7 @@ class HerdingEnv(gym.Env):
|
||||
seed: Optional[int] = None,
|
||||
use_lidar: bool = True,
|
||||
frame_stack: int = 1,
|
||||
drive_mode: str = "differential",
|
||||
):
|
||||
super().__init__()
|
||||
# ``use_lidar=True`` (default): obs and imitation-reward teacher
|
||||
@@ -95,7 +100,14 @@ class HerdingEnv(gym.Env):
|
||||
# giving a memoryless MLP temporal context. K=1 → single frame.
|
||||
self._frame_stack = max(1, int(frame_stack))
|
||||
self._frame_buffer: list[np.ndarray] = []
|
||||
self.action_space = spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
|
||||
|
||||
# Drive mode: "differential" (2-wheel) or "mecanum" (4-wheel omni).
|
||||
self._drive_mode = drive_mode.lower()
|
||||
if self._drive_mode not in ("differential", "mecanum"):
|
||||
raise ValueError(f"Unknown drive_mode: {drive_mode!r}")
|
||||
action_dim = 3 if self._drive_mode == "mecanum" else 2
|
||||
self.action_space = spaces.Box(-1.0, 1.0, shape=(action_dim,),
|
||||
dtype=np.float32)
|
||||
self._single_obs_dim = OBS_DIM
|
||||
self.observation_space = spaces.Box(
|
||||
low=-np.inf, high=np.inf,
|
||||
@@ -110,6 +122,11 @@ class HerdingEnv(gym.Env):
|
||||
# 1 = sheep spawn anywhere in the field (deployment distribution).
|
||||
self._difficulty = float(difficulty)
|
||||
self._initial_seed = seed
|
||||
self._initial_seed_used = False
|
||||
|
||||
# Env-owned RNG for wander jitter, re-seeded from np_random on reset.
|
||||
self._py_rng = random.Random()
|
||||
self._action_dim = action_dim
|
||||
|
||||
# State (initialised in reset)
|
||||
self.dog_x = self.dog_y = self.dog_heading = 0.0
|
||||
@@ -119,17 +136,14 @@ class HerdingEnv(gym.Env):
|
||||
self.sheep_penned = np.zeros(0, dtype=bool)
|
||||
self.sheep_wander = np.zeros(0, dtype=np.float32)
|
||||
|
||||
self.prev_action = np.zeros(2, dtype=np.float32)
|
||||
self.smoothed_action = np.zeros(2, dtype=np.float32)
|
||||
self.prev_action = np.zeros(self._action_dim, dtype=np.float32)
|
||||
self.smoothed_action = np.zeros(self._action_dim, dtype=np.float32)
|
||||
self.steps = 0
|
||||
self.n_sheep = 0
|
||||
self.prev_n_penned = 0
|
||||
self.prev_d_pen = 0.0
|
||||
self.prev_radius = 0.0
|
||||
|
||||
# Env-owned RNG for wander jitter, re-seeded from np_random on reset.
|
||||
self._py_rng = random.Random()
|
||||
|
||||
# --- Public knobs ---
|
||||
def set_max_n_sheep(self, value: int) -> None:
|
||||
self._max_n_sheep = int(np.clip(value, 1, MAX_SHEEP))
|
||||
@@ -149,6 +163,10 @@ class HerdingEnv(gym.Env):
|
||||
|
||||
# --- gym API ---
|
||||
def reset(self, *, seed=None, options=None):
|
||||
if (seed is None and self._initial_seed is not None
|
||||
and not self._initial_seed_used):
|
||||
seed = self._initial_seed
|
||||
self._initial_seed_used = True
|
||||
super().reset(seed=seed)
|
||||
self._py_rng.seed(int(self.np_random.integers(0, 2**31 - 1)))
|
||||
opts = options or {}
|
||||
@@ -168,16 +186,32 @@ class HerdingEnv(gym.Env):
|
||||
# Sheep spawn region linearly interpolates with difficulty:
|
||||
# 0 → small box near the gate, 1 → full field.
|
||||
d = self._difficulty
|
||||
sx_lo = 7.0 - d * 20.0
|
||||
sx_hi = 14.0 - d * 1.0
|
||||
sy_lo = -12.0 + d * 0.0
|
||||
sy_hi = -6.0 + d * 19.0
|
||||
if FIELD_SHAPE == "field_round":
|
||||
# Round field: spawn in a sector near the gate (south),
|
||||
# expanding to the full circle at difficulty=1.
|
||||
spawn_r_lo = 3.0
|
||||
spawn_r_hi = d * FIELD_ROUND_R * 0.8 + (1.0 - d) * 6.0
|
||||
# Angle spread around south (±60° at d=0, full circle at d=1).
|
||||
half_angle = math.radians(60) + d * math.radians(120)
|
||||
angle_lo = math.pi / 2.0 - half_angle # from south = -π/2
|
||||
angle_hi = math.pi / 2.0 + half_angle
|
||||
else:
|
||||
sx_lo = 7.0 - d * 20.0
|
||||
sx_hi = 14.0 - d * 1.0
|
||||
sy_lo = -12.0 + d * 0.0
|
||||
sy_hi = -6.0 + d * 19.0
|
||||
|
||||
sxs, sys_, shs, sws = [], [], [], []
|
||||
for _ in range(self.n_sheep):
|
||||
for _try in range(100):
|
||||
sx = float(self.np_random.uniform(sx_lo, sx_hi))
|
||||
sy = float(self.np_random.uniform(sy_lo, sy_hi))
|
||||
if FIELD_SHAPE == "field_round":
|
||||
r_spawn = float(self.np_random.uniform(spawn_r_lo, spawn_r_hi))
|
||||
a_spawn = float(self.np_random.uniform(angle_lo, angle_hi))
|
||||
sx = r_spawn * math.cos(a_spawn)
|
||||
sy = -r_spawn * math.sin(a_spawn)
|
||||
else:
|
||||
sx = float(self.np_random.uniform(sx_lo, sx_hi))
|
||||
sy = float(self.np_random.uniform(sy_lo, sy_hi))
|
||||
# Reject if too close to the dog or another sheep, or
|
||||
# already in the gate column (would start "penned").
|
||||
if math.hypot(sx - self.dog_x, sy - self.dog_y) < 3.0:
|
||||
@@ -198,8 +232,8 @@ class HerdingEnv(gym.Env):
|
||||
self.sheep_wander = np.asarray(sws, dtype=np.float32)
|
||||
self.sheep_penned = np.zeros(self.n_sheep, dtype=bool)
|
||||
|
||||
self.prev_action = np.zeros(2, dtype=np.float32)
|
||||
self.smoothed_action = np.zeros(2, dtype=np.float32)
|
||||
self.prev_action = np.zeros(self._action_dim, dtype=np.float32)
|
||||
self.smoothed_action = np.zeros(self._action_dim, dtype=np.float32)
|
||||
self.steps = 0
|
||||
self.prev_n_penned = 0
|
||||
self.prev_d_pen, self.prev_radius = self._flock_metrics()
|
||||
@@ -225,25 +259,46 @@ class HerdingEnv(gym.Env):
|
||||
)
|
||||
self.prev_action = self.smoothed_action.copy()
|
||||
vx, vy = float(self.smoothed_action[0]), float(self.smoothed_action[1])
|
||||
omega = float(self.smoothed_action[2]) if self._action_dim >= 3 else 0.0
|
||||
|
||||
# Safety supervisor — dog stays north of the gate.
|
||||
if self.dog_y < DOG_SOUTH_LIMIT and vy < 0.0:
|
||||
vx, vy = 0.0, 1.0
|
||||
|
||||
# Step the dog.
|
||||
wL, wR = velocity_to_wheels(
|
||||
vx, vy, self.dog_heading,
|
||||
max_linear=DOG_MAX_LINEAR,
|
||||
wheel_radius=DOG_WHEEL_RADIUS,
|
||||
max_wheel_omega=DOG_MAX_WHEEL_OMEGA,
|
||||
k_turn=4.0,
|
||||
)
|
||||
self.dog_x, self.dog_y, self.dog_heading = kinematics_step(
|
||||
self.dog_x, self.dog_y, self.dog_heading,
|
||||
wL, wR, DOG_WHEEL_RADIUS, DOG_WHEEL_BASE, WEBOTS_DT,
|
||||
)
|
||||
self.dog_x = float(np.clip(self.dog_x, FIELD_X[0] + 0.3, FIELD_X[1] - 0.3))
|
||||
self.dog_y = float(np.clip(self.dog_y, DOG_SOUTH_LIMIT, FIELD_Y[1] - 0.3))
|
||||
if self._drive_mode == "mecanum":
|
||||
w_fl, w_fr, w_rl, w_rr = velocity_to_mecanum_wheels(
|
||||
vx, vy, omega, self.dog_heading,
|
||||
max_linear=DOG_MAX_LINEAR,
|
||||
wheel_radius=DOG_WHEEL_RADIUS,
|
||||
lx=DOG_WHEEL_BASE_X / 2.0, ly=DOG_WHEEL_BASE_Y / 2.0,
|
||||
max_wheel_omega=DOG_MAX_WHEEL_OMEGA,
|
||||
k_turn=4.0,
|
||||
wheel_base=DOG_WHEEL_BASE,
|
||||
)
|
||||
self.dog_x, self.dog_y, self.dog_heading = mecanum_kinematics_step(
|
||||
self.dog_x, self.dog_y, self.dog_heading,
|
||||
w_fl, w_fr, w_rl, w_rr,
|
||||
DOG_WHEEL_RADIUS,
|
||||
DOG_WHEEL_BASE_X / 2.0, DOG_WHEEL_BASE_Y / 2.0,
|
||||
WEBOTS_DT,
|
||||
)
|
||||
else:
|
||||
wL, wR = velocity_to_wheels(
|
||||
vx, vy, self.dog_heading,
|
||||
max_linear=DOG_MAX_LINEAR,
|
||||
wheel_radius=DOG_WHEEL_RADIUS,
|
||||
max_wheel_omega=DOG_MAX_WHEEL_OMEGA,
|
||||
k_turn=4.0,
|
||||
)
|
||||
self.dog_x, self.dog_y, self.dog_heading = kinematics_step(
|
||||
self.dog_x, self.dog_y, self.dog_heading,
|
||||
wL, wR, DOG_WHEEL_RADIUS, DOG_WHEEL_BASE, WEBOTS_DT,
|
||||
)
|
||||
self.dog_x, self.dog_y = clip_to_field(self.dog_x, self.dog_y, margin=0.3)
|
||||
# Extra constraint: dog stays north of the gate area.
|
||||
if self.dog_y < DOG_SOUTH_LIMIT:
|
||||
self.dog_y = DOG_SOUTH_LIMIT
|
||||
|
||||
# Step sheep and update penned flags (GT-based).
|
||||
for i in range(self.n_sheep):
|
||||
@@ -304,13 +359,21 @@ class HerdingEnv(gym.Env):
|
||||
SHEEP_WHEEL_RADIUS, SHEEP_WHEEL_BASE, WEBOTS_DT,
|
||||
)
|
||||
|
||||
# Wall clipping (south wall absent inside the gate column).
|
||||
nx = float(np.clip(nx, FIELD_X[0] + 0.2, FIELD_X[1] - 0.2))
|
||||
in_gate_col = PEN_X[0] <= nx <= PEN_X[1]
|
||||
if in_gate_col:
|
||||
ny = float(np.clip(ny, PEN_Y[0] + 0.2, FIELD_Y[1] - 0.2))
|
||||
# Wall clipping.
|
||||
if FIELD_SHAPE == "field_round":
|
||||
in_gate_col = PEN_X[0] <= nx <= PEN_X[1]
|
||||
if in_gate_col:
|
||||
# Allow passage through the gate column (south of the wall).
|
||||
ny = float(np.clip(ny, PEN_Y[0] + 0.2, FIELD_Y[1] - 0.2))
|
||||
else:
|
||||
nx, ny = clip_to_field(nx, ny, margin=0.2)
|
||||
else:
|
||||
ny = float(np.clip(ny, FIELD_Y[0] + 0.2, FIELD_Y[1] - 0.2))
|
||||
nx = float(np.clip(nx, FIELD_X[0] + 0.2, FIELD_X[1] - 0.2))
|
||||
in_gate_col = PEN_X[0] <= nx <= PEN_X[1]
|
||||
if in_gate_col:
|
||||
ny = float(np.clip(ny, PEN_Y[0] + 0.2, FIELD_Y[1] - 0.2))
|
||||
else:
|
||||
ny = float(np.clip(ny, FIELD_Y[0] + 0.2, FIELD_Y[1] - 0.2))
|
||||
|
||||
self.sheep_x[i] = nx
|
||||
self.sheep_y[i] = ny
|
||||
@@ -374,6 +437,7 @@ class HerdingEnv(gym.Env):
|
||||
(self.dog_x, self.dog_y), self.dog_heading,
|
||||
sheep_xy_list, sheep_penned_list,
|
||||
n_max=self._max_n_sheep,
|
||||
n_expected=self.n_sheep,
|
||||
)
|
||||
|
||||
def _build_obs(self) -> np.ndarray:
|
||||
|
||||
+65
-4
@@ -20,8 +20,26 @@ Usage::
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Early CLI pre-parse for --world so geometry is configured before any
# herding.* / training.* import binds geometry constants. Matches the
# pattern used by training.bc.collect and training.eval.
import sys  # explicit import; ``os.sys`` is an undocumented detail of ``os``

_pre_argv = sys.argv[1:]  # slicing already yields a fresh list
_pre_world = None
for i, a in enumerate(_pre_argv):
    # No ``break``: argparse keeps the LAST occurrence of a repeated
    # option, so the pre-parse must do the same for both parses to agree.
    if a == "--world" and i + 1 < len(_pre_argv):
        _pre_world = _pre_argv[i + 1]
    elif a.startswith("--world="):
        _pre_world = a.split("=", 1)[1]
if _pre_world is not None:
    from herding.world.geometry import configure as _geo_configure
    _geo_configure(_pre_world)
    # Mirror the choice into HERDING_WORLD, the env var that the
    # ``--world`` help text names as the fallback source.
    os.environ["HERDING_WORLD"] = _pre_world
|
||||
|
||||
import numpy as np
|
||||
import torch as th
|
||||
import torch.nn.functional as F
|
||||
@@ -38,9 +56,14 @@ from training.herding_env import HerdingEnv
|
||||
# Env factory
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
def _make_env(rank: int, seed: int, frame_stack: int,
              drive_mode: str = "differential",
              difficulty: float = 1.0,
              max_n_sheep: int = 10):
    """Return a zero-argument factory that builds one monitored env.

    Args:
        rank: Index of this env; added to ``seed`` so each env gets its
            own seed.
        seed: Base seed.
        frame_stack: Forwarded to ``HerdingEnv``.
        drive_mode: Forwarded to ``HerdingEnv``.
        difficulty: Forwarded to ``HerdingEnv``.
        max_n_sheep: Forwarded to ``HerdingEnv``.

    Returns:
        A thunk that constructs the env only when called.
    """
    def _thunk():
        env = HerdingEnv(seed=seed + rank, frame_stack=frame_stack,
                         drive_mode=drive_mode, difficulty=difficulty,
                         max_n_sheep=max_n_sheep)
        # Monitor records these per-episode info keys alongside its own stats.
        env = Monitor(env, info_keywords=("is_success", "n_sheep", "n_penned"))
        return env
    return _thunk
|
||||
@@ -198,13 +221,34 @@ def main() -> None:
|
||||
help="SB3 per-batch KL early-stop guard.")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--device", default="cpu")
|
||||
parser.add_argument("--drive-mode", default=None,
|
||||
choices=["differential", "mecanum"],
|
||||
help="Drive mode. If not set, inferred from "
|
||||
"BC action dimension (2→differential, 3→mecanum).")
|
||||
parser.add_argument("--imitate-weight", type=float, default=None,
|
||||
help="Override env.W_IMITATE (e.g. 0.0 to drop "
|
||||
"Strömbom imitation during fine-tune).")
|
||||
parser.add_argument("--time-weight", type=float, default=None,
|
||||
help="Override env.W_TIME (e.g. -0.1 for a "
|
||||
"per-step time penalty).")
|
||||
parser.add_argument("--difficulty", type=float, default=1.0,
|
||||
help="HerdingEnv difficulty for PPO rollouts. "
|
||||
"Must match eval (1.0) to avoid train/eval "
|
||||
"distribution mismatch.")
|
||||
parser.add_argument("--max-n-sheep", type=int, default=10,
|
||||
help="Upper bound on flock size sampled each reset.")
|
||||
parser.add_argument("--world", default=None,
|
||||
choices=["field", "field_round"],
|
||||
help="World shape. If not set, uses HERDING_WORLD "
|
||||
"env var or defaults to 'field'.")
|
||||
args = parser.parse_args()
|
||||
# --world was already honoured in the early pre-parse above; here we
|
||||
# just sanity-check that the final argparse view agrees.
|
||||
if args.world is not None:
|
||||
from herding.world.geometry import FIELD_SHAPE as _CURRENT_SHAPE
|
||||
if args.world != _CURRENT_SHAPE:
|
||||
print(f"[rl] WARNING: --world={args.world} but geometry is "
|
||||
f"'{_CURRENT_SHAPE}'. File a bug.")
|
||||
|
||||
bc_zip = Path(args.bc) / "policy.zip"
|
||||
if not bc_zip.exists():
|
||||
@@ -226,9 +270,26 @@ def main() -> None:
|
||||
frame_stack = obs_dim // OBS_DIM
|
||||
print(f"[rl] BC obs dim {obs_dim} → frame_stack={frame_stack}")
|
||||
|
||||
env_fns = [_make_env(i, args.seed, frame_stack) for i in range(args.n_envs)]
|
||||
# Infer drive mode from BC action dim if not explicitly set.
|
||||
bc_action_dim = int(ref_only.action_space.shape[0])
|
||||
if args.drive_mode is not None:
|
||||
drive_mode = args.drive_mode
|
||||
elif bc_action_dim == 3:
|
||||
drive_mode = "mecanum"
|
||||
else:
|
||||
drive_mode = "differential"
|
||||
print(f"[rl] drive_mode={drive_mode} (BC action_dim={bc_action_dim})")
|
||||
|
||||
env_fns = [_make_env(i, args.seed, frame_stack, drive_mode,
|
||||
difficulty=args.difficulty,
|
||||
max_n_sheep=args.max_n_sheep)
|
||||
for i in range(args.n_envs)]
|
||||
venv = SubprocVecEnv(env_fns) if args.n_envs > 1 else DummyVecEnv(env_fns)
|
||||
eval_venv = DummyVecEnv([_make_env(99, args.seed + 999, frame_stack)])
|
||||
eval_venv = DummyVecEnv([_make_env(99, args.seed + 999, frame_stack,
|
||||
drive_mode,
|
||||
difficulty=args.difficulty,
|
||||
max_n_sheep=args.max_n_sheep)])
|
||||
print(f"[rl] difficulty={args.difficulty} max_n_sheep={args.max_n_sheep}")
|
||||
|
||||
# Reward-shaping overrides (broadcast to every env instance).
|
||||
def _broadcast(method: str, value):
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user