"""Shepherd Dog controller (Webots). Mode is selected by ``HERDING_MODE`` (env var, or via the ``herding_runtime.cfg`` file the launcher writes since Webots strips env vars on some setups): strombom → canonical Strömbom collect/drive heuristic. sequential → single-target "pin and push" — drives the sheep closest to the pen. bc → behaviour-cloned MLP, trained on Strömbom demos via sim. Default policy directory: training/runs/bc_v3. rl → KL-regularised PPO fine-tune of the BC policy. Same obs/action space as bc; refines time-to-pen via environment reward while staying anchored to bc. Default policy directory: training/runs/rl_v1. dagger → DAgger data collection. Reads sheep ground-truth via the receiver, computes the active-scan teacher's recommended action at every step, drives with either the teacher (HERDING_DAGGER_DRIVER=teacher, default) or the loaded student (=student), and logs each (lidar_stacked_obs, teacher_action) pair. On exit dumps to ``training/dagger/dagger_.npz`` for ``tools.dagger_merge_train`` to consume. Sheep perception ---------------- The dog now perceives sheep through its **front-mounted 140° LiDAR** (``protos/ShepherdDog.proto``: 180 rays, 12 m max range). Each step the controller: 1. Reads ``lidar.getRangeImage()``. 2. Runs ``herding.lidar_perception.detections_from_scan`` to cluster returns into world-frame ``(x, y)`` sheep estimates. 3. Folds those into a ``herding.sheep_tracker.SheepTracker`` which maintains last-seen positions for sheep currently out of the FOV and latches "penned" once a track disappears near the gate. The output of step 3 is a ``{name: (x, y)}`` dict shaped exactly like the receiver-based one we used to consume — so Strömbom, Sequential and the BC obs builder run unchanged. The sheep→dog Emitter/Receiver link is still up (kept passively for compatibility) but its messages are *not* used for control. All modes share the same low-level differential-drive controller (``herding.diffdrive.velocity_to_wheels`` with cos(err)-clamped forward speed), so switching modes does not retune actuation. A safety supervisor enforces the "dog stays out of the pen" invariant: if the action would push the dog past ``DOG_SOUTH_LIMIT`` it is overridden with a north-driving correction. RL fallback: if the policy zip can't be loaded (SB3 missing, file missing), the controller drops to strombom mode automatically. """ import math import os import sys # --- Make the shared herding/ package importable from this controller dir --- _HERE = os.path.dirname(os.path.abspath(__file__)) _PROJECT_ROOT = os.path.normpath(os.path.join(_HERE, "..", "..")) if _PROJECT_ROOT not in sys.path: sys.path.insert(0, _PROJECT_ROOT) from controller import Robot from herding.active_scan import ActiveScanTeacher from herding.control import modulate_speed_near_sheep from herding.diffdrive import velocity_to_wheels from herding.geometry import ( DOG_MAX_LINEAR, DOG_MAX_WHEEL_OMEGA, DOG_SOUTH_LIMIT, DOG_WHEEL_RADIUS, PEN_ENTRY, is_penned_position, ) from herding.lidar_perception import detections_from_scan from herding.obs import OBS_DIM, build_obs from herding.sequential import compute_action_debug as sequential_action_debug from herding.sheep_tracker import SheepTracker from herding.strombom import compute_action as strombom_action from herding.strombom import compute_action_debug as strombom_action_debug # --------------------------------------------------------------------------- # Mode selection # --------------------------------------------------------------------------- def _load_runtime_config(): """Read mode + policy_dir overrides from a runtime config file. Webots strips HERDING_* env vars in some configurations, so the launcher writes a tiny ``herding_runtime.cfg`` (key=value lines) in the project root and the controller reads it here. Env vars win if both are present; the file is the fallback. """ cfg_path = os.path.join(_PROJECT_ROOT, "herding_runtime.cfg") if not os.path.exists(cfg_path): return {} out = {} try: with open(cfg_path) as f: for line in f: line = line.strip() if not line or line.startswith("#") or "=" not in line: continue k, _, v = line.partition("=") out[k.strip().upper()] = v.strip() except OSError: return {} return out _runtime_cfg = _load_runtime_config() MODE = (os.environ.get("HERDING_MODE") or _runtime_cfg.get("HERDING_MODE") or "bc").lower() def _resolve_policy_dir(mode: str) -> str: """Where to look for the trained policy for the given mode. Priority: 1. HERDING_POLICY_DIR env var or runtime-cfg entry, if it points to a real directory. 2. Mode-specific default: bc → training/runs/bc_v3 (Strömbom-imitated MLP) rl → training/runs/rl_v1 (KL-PPO fine-tune of bc_v3) 3. Fall back to bc_v3. All checkpoints are frame-stacked K = 4; ``policy_loader`` reads the stacking factor from the policy's observation space. """ env_dir = (os.environ.get("HERDING_POLICY_DIR") or _runtime_cfg.get("HERDING_POLICY_DIR")) if env_dir and os.path.isdir(env_dir): return env_dir mode_default = { "bc": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"), "rl": os.path.join(_PROJECT_ROOT, "training", "runs", "rl_v1"), "dagger": os.path.join(_PROJECT_ROOT, "training", "runs", "bc_v3"), } primary = mode_default.get(mode, mode_default["bc"]) if os.path.isdir(primary): return primary # Fall back to BC if the requested checkpoint isn't there yet # (e.g., user asked for `rl` before training the fine-tune). fallback = mode_default["bc"] if os.path.isdir(fallback): return fallback return env_dir or primary _VALID_MODES = ("bc", "rl", "strombom", "sequential", "dagger", "diag") # Back-compat: an old config saying HERDING_MODE=rl meant "the BC policy". # We now use `rl` strictly for the KL-PPO fine-tune. If the rl_v1 # directory isn't present, _resolve_policy_dir below silently falls # back to bc_v3, preserving the old behaviour. if MODE not in _VALID_MODES: print(f"[dog] unknown HERDING_MODE={MODE!r}; defaulting to strombom.") MODE = "strombom" DAGGER_DRIVER = (os.environ.get("HERDING_DAGGER_DRIVER") or _runtime_cfg.get("HERDING_DAGGER_DRIVER") or "teacher").lower() if DAGGER_DRIVER not in ("teacher", "student"): DAGGER_DRIVER = "teacher" POLICY_DIR = _resolve_policy_dir(MODE) policy_handle = None if MODE in ("bc", "rl", "dagger"): print(f"[dog] resolved POLICY_DIR={POLICY_DIR} exists={os.path.isdir(POLICY_DIR)}") try: from policy_loader import load as _load_policy policy_handle = _load_policy(POLICY_DIR) print(f"[dog] policy loaded from {POLICY_DIR}") except Exception as exc: if MODE in ("bc", "rl"): print(f"[dog] policy load failed ({exc!r}); falling back to strombom.") MODE = "strombom" else: # In dagger mode, no policy is fine if driver=teacher. print(f"[dog] policy load failed ({exc!r}); dagger driver forced to teacher.") policy_handle = None print(f"[dog] running in mode={MODE}" + (f" driver={DAGGER_DRIVER}" if MODE == "dagger" else "")) # --------------------------------------------------------------------------- # Action smoothing + safety supervisor # --------------------------------------------------------------------------- ACTION_SMOOTH = 0.55 # was 0.35; bumped for less frame-to-frame action jitter prev_action = (0.0, 0.0) def safety_clamp(vx: float, vy: float, dog_x: float, dog_y: float) -> tuple: """If the dog is near the south barrier and the action would push it further south, override with a northward action. Hard invariant: the dog never enters the pen.""" if dog_y < DOG_SOUTH_LIMIT and vy < 0.0: return (0.0, 1.0) if dog_y < DOG_SOUTH_LIMIT + 0.5 and vy < -0.2: return (vx * 0.5, max(0.0, vy + 0.5)) return (vx, vy) # --------------------------------------------------------------------------- # Driving # --------------------------------------------------------------------------- def drive(vx: float, vy: float, left_motor, right_motor, compass, motor_max: float): if math.hypot(vx, vy) < 1e-3: left_motor.setVelocity(0.0) right_motor.setVelocity(0.0) return n = compass.getValues() h = math.atan2(n[0], n[1]) left, right = velocity_to_wheels( vx, vy, h, max_linear=DOG_MAX_LINEAR, wheel_radius=DOG_WHEEL_RADIUS, max_wheel_omega=motor_max, k_turn=4.0, ) left_motor.setVelocity(left) right_motor.setVelocity(right) # --------------------------------------------------------------------------- # Webots devices # --------------------------------------------------------------------------- robot = Robot() timestep = int(robot.getBasicTimeStep()) left_motor = robot.getDevice("left wheel motor") right_motor = robot.getDevice("right wheel motor") left_motor.setPosition(float("inf")) right_motor.setPosition(float("inf")) left_motor.setVelocity(0.0) right_motor.setVelocity(0.0) MOTOR_MAX = min(left_motor.getMaxVelocity(), DOG_MAX_WHEEL_OMEGA) gps = robot.getDevice("gps"); gps.enable(timestep) compass = robot.getDevice("compass"); compass.enable(timestep) receiver = robot.getDevice("receiver"); receiver.enable(timestep) emitter = robot.getDevice("emitter") lidar = robot.getDevice("lidar"); lidar.enable(timestep) # The receiver channel from sheep is no longer consumed for perception # (kept enabled in case any peripheral tooling reads it). Sheep # positions come exclusively from the LiDAR + tracker pipeline below. tracker = SheepTracker() # Cosmetic ear motors — ignored by control logic but keep them animated. left_ear = robot.getDevice("left ear motor") right_ear = robot.getDevice("right ear motor") left_ear.setPosition(float("inf")) right_ear.setPosition(float("inf")) left_ear.setVelocity(0.0) right_ear.setVelocity(0.0) ear_phase = 0.0 EAR_AMPLITUDE = 0.35 EAR_RATE = 8.0 # --------------------------------------------------------------------------- # Main loop # --------------------------------------------------------------------------- # Active sheep positions come from the LiDAR-fed tracker each step; # penned_set is the tracker's ``get_penned_set()`` call. We drain the # receiver queue without consuming it, so the small backlog of sheep # pings can't grow unbounded. step_count = 0 import atexit import time import numpy as _np # DAgger state ---------------------------------------------------------- # Logged each step in dagger mode: (stacked_lidar_obs, teacher_action). DAGGER_LOG_OBS: list = [] DAGGER_LOG_ACT: list = [] # Diagnostic mode buffer (one dict per step). DIAG_BUF: list = [] # Frame stack buffer the controller maintains itself when dagger mode is # active — the stacked obs we log must match what the policy sees so the # downstream BC consumes (stacked_obs, teacher_action) pairs cleanly. _FRAME_STACK = (policy_handle.frame_stack if policy_handle is not None else 4) _dagger_buffer: list = [] # Active-scan teacher operates on GT (read from receiver). _dagger_teacher = ActiveScanTeacher(strombom_action) if MODE == "dagger" else None # GT positions accumulated from the receiver (sheep emit their xy each step). _gt_sheep: dict = {} _DAGGER_RUN_TS = int(time.time()) # one file per controller run _DAGGER_DUMPED = False # Sentinel that the auto-collection script polls — empty file written # when this controller decides the run is "done" (all sheep penned, by # GT). The launcher then kills Webots and moves on without waiting out # its timeout. Honoured only in dagger mode. _DAGGER_DONE_FILE = os.path.join(_PROJECT_ROOT, "training", "dagger", ".DONE") def _dump_dagger_log(): """Save accumulated (obs, teacher_action) pairs to disk on exit. Webots may SIGKILL the controller, so the loop also calls this every DAGGER_FLUSH_STEPS so we lose at most a few seconds of data per run. Idempotent — repeated calls overwrite the same file with the latest accumulated buffer. """ global _DAGGER_DUMPED if MODE != "dagger" or not DAGGER_LOG_OBS: return out_dir = os.path.join(_PROJECT_ROOT, "training", "dagger") os.makedirs(out_dir, exist_ok=True) out_path = os.path.join(out_dir, f"dagger_{_DAGGER_RUN_TS}.npz") obs_arr = _np.stack(DAGGER_LOG_OBS).astype(_np.float32) act_arr = _np.stack(DAGGER_LOG_ACT).astype(_np.float32) _np.savez(out_path, obs=obs_arr, actions=act_arr) if not _DAGGER_DUMPED: print(f"[dog dagger] wrote {len(DAGGER_LOG_OBS)} pairs → {out_path}") _DAGGER_DUMPED = True DAGGER_FLUSH_STEPS = 500 atexit.register(_dump_dagger_log) while robot.step(timestep) != -1: step_count += 1 # Drain receiver. In every mode we capture GT for the diagnostic # log line — perception still comes from LiDAR, the GT is read-only. while receiver.getQueueLength() > 0: msg = receiver.getString() receiver.nextPacket() parts = msg.split(":") if len(parts) == 4 and parts[0] == "sheep": try: _gt_sheep[parts[1]] = (float(parts[2]), float(parts[3])) except ValueError: pass pos = gps.getValues() dog_xy = (pos[0], pos[1]) n = compass.getValues() dog_heading = math.atan2(n[0], n[1]) # ---- LiDAR perception → tracker → sheep_positions dict ---- ranges = _np.asarray(lidar.getRangeImage(), dtype=_np.float32) detections = detections_from_scan(ranges, dog_xy[0], dog_xy[1], dog_heading) sheep_positions = tracker.update(detections) penned_set = tracker.get_penned_set() # ---- Diagnostic mode: dump the first DIAG_STEPS scans + GT to disk. if MODE == "diag": DIAG_STEPS = 80 if step_count <= DIAG_STEPS: DIAG_BUF.append(dict( step=step_count, ranges=ranges.copy(), dog_x=dog_xy[0], dog_y=dog_xy[1], dog_h=dog_heading, gt_sheep=dict(_gt_sheep), detections=list(detections), )) if step_count == DIAG_STEPS: _diag_path = os.path.join(_PROJECT_ROOT, "training", "dagger", f"diag_{int(time.time())}.npz") os.makedirs(os.path.dirname(_diag_path), exist_ok=True) _np.savez( _diag_path, ranges=_np.stack([d["ranges"] for d in DIAG_BUF]), dog_xy=_np.array([[d["dog_x"], d["dog_y"]] for d in DIAG_BUF], dtype=_np.float32), dog_h=_np.array([d["dog_h"] for d in DIAG_BUF], dtype=_np.float32), # Per-step GT serialised: max-pad to 10 sheep. gt_xy=_np.array([ [list(d["gt_sheep"].get(f"sheep{i}", (1e9, 1e9))) for i in range(1, 11)] for d in DIAG_BUF ], dtype=_np.float32), detections=_np.array([ len(d["detections"]) for d in DIAG_BUF ], dtype=_np.int32), ) print(f"[dog diag] wrote {DIAG_STEPS} scans → {_diag_path}") # Build the single-frame LiDAR obs (matches what the env produces). sheep_xy_list = list(sheep_positions.values()) sheep_penned_list = [False] * len(sheep_xy_list) single_obs = build_obs(dog_xy, dog_heading, sheep_xy_list, sheep_penned_list) # Maintain our own frame stack so logged obs == what policy sees. if not _dagger_buffer: _dagger_buffer = [single_obs.copy() for _ in range(_FRAME_STACK)] else: _dagger_buffer.append(single_obs) if len(_dagger_buffer) > _FRAME_STACK: _dagger_buffer = _dagger_buffer[-_FRAME_STACK:] stacked_obs = _np.concatenate(_dagger_buffer, axis=0).astype(_np.float32) # ---- Action selection ---- if MODE == "diag": # Diagnostic mode: rotate in place so the captured scans cover # all 360° of view from one position. Target = heading + π → # cos(err) clamps forward to ~0, the dog spins. _t = dog_heading + math.pi vx, vy = math.cos(_t), math.sin(_t) elif MODE == "dagger": # Teacher: active-scan + Strömbom on GT (active sheep only). gt_active = {name: xy for name, xy in _gt_sheep.items() if not is_penned_position(xy[0], xy[1])} t_vx, t_vy, _mode_str = _dagger_teacher( dog_xy, dog_heading, gt_active, PEN_ENTRY, ) # Student (if a policy is loaded). s_vx, s_vy = None, None if policy_handle is not None: action = policy_handle.predict(stacked_obs) s_vx, s_vy = float(action[0]), float(action[1]) # Drive selection. if DAGGER_DRIVER == "student" and policy_handle is not None: vx, vy = s_vx, s_vy else: vx, vy = t_vx, t_vy # Always log the teacher action (this is the supervision signal). DAGGER_LOG_OBS.append(stacked_obs.copy()) DAGGER_LOG_ACT.append(_np.array([t_vx, t_vy], dtype=_np.float32)) elif MODE in ("bc", "rl") and policy_handle is not None: # Pass the single-frame obs; the policy_loader maintains its own # frame stack internally. Both bc and rl use the same control # interface — the only difference is which checkpoint loaded. action = policy_handle.predict(single_obs) vx, vy = float(action[0]), float(action[1]) elif MODE in ("strombom", "sequential"): # Wrap the analytic teacher in ActiveScanTeacher so the dog # rotates / walks-to-centre when the tracker briefly empties, # instead of going idle. Without this wrapper, the first 2 s # of LiDAR-blind operation kills the run because Strömbom and # Sequential both return (0, 0) when there are no positions. if "_analytic_teacher" not in globals(): from herding.sequential import compute_action as sequential_action _analytic_teacher = ActiveScanTeacher( strombom_action if MODE == "strombom" else sequential_action ) vx, vy, _mode_str = _analytic_teacher( dog_xy, dog_heading, sheep_positions, PEN_ENTRY, ) # Shared post-process: speed modulation near sheep. Applies to bc, # rl, strombom, sequential — every mode where the action source is # nominally unit-magnitude. In dagger mode the active-scan teacher # has already modulated, and the diag mode action is hand-built for # rotation; both skip. if MODE not in ("dagger", "diag"): vx, vy = modulate_speed_near_sheep(vx, vy, dog_xy, sheep_positions) # EMA smoothing — reduces oscillation from policy or Strömbom flips. vx = ACTION_SMOOTH * prev_action[0] + (1.0 - ACTION_SMOOTH) * vx vy = ACTION_SMOOTH * prev_action[1] + (1.0 - ACTION_SMOOTH) * vy # Safety: dog must never enter the pen. vx, vy = safety_clamp(vx, vy, dog_xy[0], dog_xy[1]) prev_action = (vx, vy) drive(vx, vy, left_motor, right_motor, compass, MOTOR_MAX) emitter.send(f"dog:{dog_xy[0]:.4f}:{dog_xy[1]:.4f}") # Cosmetic ear wiggle — purely visual. ear_phase += 0.12 ear_pos = EAR_AMPLITUDE * math.sin(ear_phase) left_ear.setVelocity(EAR_RATE) right_ear.setVelocity(EAR_RATE) left_ear.setPosition(ear_pos) right_ear.setPosition(-ear_pos) # --- DAgger: early-stop when all GT sheep are penned --- if MODE == "dagger" and _gt_sheep: gt_active_count = sum(1 for x, y in _gt_sheep.values() if not is_penned_position(x, y)) if gt_active_count == 0 and not os.path.exists(_DAGGER_DONE_FILE): _dump_dagger_log() open(_DAGGER_DONE_FILE, "w").close() print(f"[dog dagger] all {len(_gt_sheep)} sheep penned — " f"wrote {_DAGGER_DONE_FILE}, exiting early") if MODE == "dagger" and step_count % DAGGER_FLUSH_STEPS == 0 and DAGGER_LOG_OBS: _dump_dagger_log() if step_count % 200 == 0: gt_penned = sum(1 for x, y in _gt_sheep.values() if is_penned_position(x, y)) gt_total = len(_gt_sheep) extra = "" if MODE == "dagger": extra = f" logged={len(DAGGER_LOG_OBS)}" print(f"[dog mode={MODE}] step={step_count} " f"GT_penned={gt_penned}/{gt_total} " f"tracks_active={tracker.n_active()} " f"tracks_penned={tracker.n_penned()} " f"detections={len(detections)} action=({vx:+.2f}, {vy:+.2f}){extra}") # Loop ended (Webots told us to quit). Flush any remaining DAgger log. _dump_dagger_log()