From 3b4c99a6c4ae13b8b2b1f00ed5af8ed3a171d53d Mon Sep 17 00:00:00 2001
From: Johnny Fernandes <up202402612@up.pt>
Date: Sun, 17 May 2026 01:12:06 +0000
Subject: [PATCH] Training pipelines auto-select mecanum-Webots preset

* training/bc/collect.py: --use-webots-preset now picks the
  drive-matched variant. Mecanum drives get HERDING_MEC_WEBOTS
  (with the Webots-calibrated strafe efficiency and bleed) so the
  collected demos reflect the imperfect physical mecanum the
  deployed policy will see. Differential drives still use
  HERDING_WEBOTS (no behaviour change there).
* training/rl/train.py: mecanum fine-tune now *unconditionally*
  applies the HERDING_MEC_WEBOTS strafe efficiency and
  strafe-to-forward bleed to the PPO env's robot config (the policy
  must update against the same imperfect kinematics it deploys on).
  Differential fine-tune is unchanged.

To retrain a mecanum policy end-to-end against the new proto:

  python -m training.bc.collect --drive-mode mecanum --world field \
    --use-webots-preset \
    --out training/bc/demos_mecanum_field_v2.npz
  python -m training.bc.pretrain --demos training/bc/demos_mecanum_field_v2.npz \
    --out training/runs/bc_mecanum_field_v2 ...
  python -m training.rl.train --bc training/runs/bc_mecanum_field_v2 \
    --out training/runs/rl_mecanum_field_v2 \
    --drive-mode mecanum --world field --use-webots-preset

The same flow applies to field_round / mecanum/round.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 training/bc/collect.py | 24 +++++++++++++++++++-----
 training/rl/train.py   | 27 +++++++++++++++++++++++----
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/training/bc/collect.py b/training/bc/collect.py
index 61eb127..dade3e4 100644
--- a/training/bc/collect.py
+++ b/training/bc/collect.py
@@ -176,17 +176,31 @@ def main():
         print(f"[demos] WARNING: --world={args.world} but geometry is "
               f"'{FIELD_SHAPE}'. This should not happen — file a bug.")
 
-    from herding.config import HerdingConfig, HERDING_WEBOTS, DomainRandomConfig, RobotConfig
+    from herding.config import (
+        HerdingConfig, HERDING_WEBOTS, HERDING_MEC_WEBOTS,
+        DomainRandomConfig, RobotConfig,
+    )
     if args.use_webots_preset:
-        herding_cfg = HERDING_WEBOTS.replace(
+        # Pick the drive-matched Webots preset — for mecanum we use the
+        # variant that simulates the physical-roller proto's strafe
+        # efficiency and forward bleed so the policy trains under the
+        # same imperfect kinematics it sees at deployment.
+        base = HERDING_MEC_WEBOTS if args.drive_mode == "mecanum" else HERDING_WEBOTS
+        herding_cfg = base.replace(
             domain_random=DomainRandomConfig(
                 fp_rate=args.fp_rate,
                 wheel_slip_std=args.wheel_slip_std,
             ),
-            robot=RobotConfig(action_smooth=args.action_smooth),
+            robot=RobotConfig(
+                action_smooth=args.action_smooth,
+                strafe_efficiency=base.robot.strafe_efficiency,
+                strafe_to_forward_bleed=base.robot.strafe_to_forward_bleed,
+            ),
         )
-        print(f"[demos] HERDING_WEBOTS preset + DR: fp_rate={args.fp_rate} "
-              f"action_smooth={args.action_smooth} wheel_slip_std={args.wheel_slip_std}")
+        preset_name = "HERDING_MEC_WEBOTS" if args.drive_mode == "mecanum" else "HERDING_WEBOTS"
+        print(f"[demos] {preset_name} preset + DR: fp_rate={args.fp_rate} "
+              f"action_smooth={args.action_smooth} wheel_slip_std={args.wheel_slip_std} "
+              f"strafe_eff={herding_cfg.robot.strafe_efficiency:.2f}")
     else:
         herding_cfg = None
         if args.fp_rate > 0.0 or args.action_smooth > 0.0 or args.wheel_slip_std > 0.0:
diff --git a/training/rl/train.py b/training/rl/train.py
index 3f0e50a..d57113d 100644
--- a/training/rl/train.py
+++ b/training/rl/train.py
@@ -275,19 +275,38 @@ def main() -> None:
         drive_mode = "differential"
     print(f"[rl] drive_mode={drive_mode} (BC action_dim={bc_action_dim})")
 
-    from herding.config import HerdingConfig, DomainRandomConfig, RobotConfig
+    from herding.config import (
+        HerdingConfig, HERDING_MEC_WEBOTS, DomainRandomConfig, RobotConfig,
+    )
     herding_cfg = None
-    if args.fp_rate > 0.0 or args.action_smooth > 0.0 or args.wheel_slip_std > 0.0:
+    # When fine-tuning a mecanum policy we always apply the Webots
+    # roller-hinge calibration to the gym kinematics (strafe efficiency
+    # and bleed). Without this, the RL agent updates against the
+    # textbook X-pattern and fails on deployment.
+    is_mecanum = (drive_mode == "mecanum")
+    if is_mecanum or args.fp_rate > 0.0 or args.action_smooth > 0.0 or args.wheel_slip_std > 0.0:
+        if is_mecanum:
+            base_robot = HERDING_MEC_WEBOTS.robot
+            strafe_eff = base_robot.strafe_efficiency
+            strafe_bleed = base_robot.strafe_to_forward_bleed
+        else:
+            strafe_eff = 1.0
+            strafe_bleed = 0.0
         herding_cfg = HerdingConfig(
             domain_random=DomainRandomConfig(
                 fp_rate=args.fp_rate,
                 wheel_slip_std=args.wheel_slip_std,
             ),
-            robot=RobotConfig(action_smooth=args.action_smooth),
+            robot=RobotConfig(
+                action_smooth=args.action_smooth,
+                strafe_efficiency=strafe_eff,
+                strafe_to_forward_bleed=strafe_bleed,
+            ),
         )
         print(f"[rl] domain-random: fp_rate={args.fp_rate}  "
               f"action_smooth={args.action_smooth}  "
-              f"wheel_slip_std={args.wheel_slip_std}")
+              f"wheel_slip_std={args.wheel_slip_std}  "
+              f"strafe_eff={strafe_eff:.2f}  strafe_bleed={strafe_bleed:.2f}")
 
     env_fns = [_make_env(i, args.seed, frame_stack, drive_mode,
                          difficulty=args.difficulty,