Checkpoint 8

2026-05-12 22:41:03 +01:00
parent a01a5c9cef
commit 5c2ee4bba5
31 changed files with 3189 additions and 380 deletions
@@ -9,35 +9,79 @@
 #   make eval       # 10-seed env eval of rl
 #   make test       # pytest suite
 #   make webots N=10 MODE=rl   # launch Webots in the chosen mode
+#   WEBOTS_HEADLESS=1 make webots   # no 3D view, fast mode (still needs DISPLAY or xvfb-run)
 #   make clean      # delete bc_demos and run artefacts
+#   make clean_all  # delete artefacts for all combinations
 #   make help       # print the target table
 #
 # Override any hyperparameter on the command line, for example:
 #   make rl PPO_STEPS=2000000 KL=0.02
 #   make eval EVAL_SEEDS=20
+#
+# Drive mode selects the locomotion model:
+#   make DRIVE=differential   # 2-wheel diff-drive (default)
+#   make DRIVE=mecanum        # 4-wheel omnidirectional
+#
+# World shape:
+#   make WORLD=field          # rectangular (default)
+#   make WORLD=field_round    # circular fence
+#
+# To train all 4 combinations:
+#   make train_all


 PY               := python

-BC_DEMOS         := training/bc/demos.npz
-BC_DIR           := training/runs/bc
-RL_DIR           := training/runs/rl
-BC_POLICY        := $(BC_DIR)/policy.zip
-RL_POLICY        := $(RL_DIR)/policy.zip
+# Drive mode and world shape — each combination gets its own artefacts.
+DRIVE            ?= differential
+WORLD            ?= field
+
+# Derived tag and paths.
+TAG               = $(DRIVE)_$(WORLD)
+BC_DEMOS          = training/bc/demos_$(TAG).npz
+BC_DIR            = training/runs/bc_$(TAG)
+RL_DIR            = training/runs/rl_$(TAG)
+BC_POLICY         = $(BC_DIR)/policy.zip
+RL_POLICY         = $(RL_DIR)/policy.zip

 # --- Demo collection ---
-TEACHER          ?= strombom
-SEEDS_PER_N      ?= 15
+TEACHER          ?= universal
+# The round field is fundamentally harder (narrow gate at the south of a circle),
+# so default to more demo seeds there (40 vs. 25) to give BC a fair shot at 60%+.
+ifeq ($(WORLD),field_round)
+SEEDS_PER_N      ?= 40
+else
+SEEDS_PER_N      ?= 25
+endif
 SUBSAMPLE        ?= 3
 FRAME_STACK      ?= 4
+DEMO_MAX_STEPS   ?= 100000

 # --- Behaviour cloning ---
+ifeq ($(WORLD),field_round)
+BC_EPOCHS        ?= 100
+else
 BC_EPOCHS        ?= 60
+endif
 BC_NET_ARCH      ?= 512,512

 # --- KL-PPO fine-tune ---
-PPO_STEPS        ?= 1000000
+# Round field: longer training, looser KL (smaller KL coefficient), and no time
+# penalty (success must be learned before speed is rewarded).
+ifeq ($(WORLD),field_round)
+PPO_STEPS        ?= 4000000
+KL               ?= 0.02
+TIME_W           ?= 0.0
+else
+PPO_STEPS        ?= 2000000
 KL               ?= 0.05
+TIME_W           ?= -0.05
+endif
+IMITATE          ?= 0.0
+# PPO rollouts at full difficulty so the training distribution matches
+# eval (deployment).  Anything lower causes a train/eval mismatch that
+# can make RL eval worse than BC.
+DIFFICULTY       ?= 1.0

 # --- Evaluation ---
 EVAL_SEEDS       ?= 10
@@ -48,16 +92,23 @@ N                ?= 10
 MODE             ?= rl


-.PHONY: all bc_demos bc rl eval test webots clean help
+.PHONY: all bc_demos bc rl eval test webots clean clean_all help \
+        train_all train_diff_rect train_diff_round \
+        train_mec_rect train_mec_round

 all: eval

+# Export HERDING_WORLD so that geometry.py picks it up at import time.
+export HERDING_WORLD = $(WORLD)
+
 bc_demos: $(BC_DEMOS)
 $(BC_DEMOS):
 	$(PY) -m training.bc.collect \
 		--teacher $(TEACHER) --out $(BC_DEMOS) \
 		--seeds-per-n $(SEEDS_PER_N) --subsample $(SUBSAMPLE) \
-		--frame-stack $(FRAME_STACK)
+		--frame-stack $(FRAME_STACK) --drive-mode $(DRIVE) \
+		--world $(WORLD) \
+		--max-steps $(DEMO_MAX_STEPS)

 bc: $(BC_POLICY)
 $(BC_POLICY): $(BC_DEMOS)
@@ -69,20 +120,44 @@ rl: $(RL_POLICY)
 $(RL_POLICY): $(BC_POLICY)
 	$(PY) -m training.rl.train \
 		--bc $(BC_DIR) --out $(RL_DIR) \
-		--total-timesteps $(PPO_STEPS) --kl-coef $(KL)
+		--total-timesteps $(PPO_STEPS) --kl-coef $(KL) \
+		--imitate-weight $(IMITATE) --time-weight $(TIME_W) \
+		--difficulty $(DIFFICULTY) \
+		--drive-mode $(DRIVE) --world $(WORLD)

 eval: $(RL_POLICY)
 	$(PY) -m training.eval --policy $(RL_DIR) \
-		--max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS)
+		--max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
+		--drive-mode $(DRIVE) --world $(WORLD)

 test:
 	$(PY) -m pytest tests/

 webots:
-	tools/run_webots.sh $(N) $(MODE)
+	tools/run_webots.sh $(N) $(MODE) $(DRIVE) $(WORLD)

 clean:
-	rm -rf $(BC_DEMOS) $(BC_DIR) $(RL_DIR)
+	rm -f $(BC_DEMOS)
+	rm -rf $(BC_DIR) $(RL_DIR)
+
+clean_all:
+	rm -f training/bc/demos_*.npz
+	rm -rf training/runs/bc_* training/runs/rl_*
+
+# --- Train all 4 combinations ---
+# Each helper target re-invokes make with one fixed DRIVE/WORLD pair and no
+# explicit goal, so the sub-make runs its default goal for that combination.
+# $(MAKE) (rather than a literal `make`) keeps -n and command-line variable
+# overrides such as PPO_STEPS=... flowing into the sub-make.
+train_diff_rect:
+	$(MAKE) DRIVE=differential WORLD=field
+
+train_diff_round:
+	$(MAKE) DRIVE=differential WORLD=field_round
+
+train_mec_rect:
+	$(MAKE) DRIVE=mecanum WORLD=field
+
+train_mec_round:
+	$(MAKE) DRIVE=mecanum WORLD=field_round
+
+# Run the four combinations from a recipe, one sub-make per line, instead of
+# listing them as prerequisites: prerequisites of a single target may be built
+# concurrently under `make -j`, whereas recipe lines always run one after the
+# other.  This keeps the four training pipelines strictly sequential.
+train_all:
+	$(MAKE) train_diff_rect
+	$(MAKE) train_diff_round
+	$(MAKE) train_mec_rect
+	$(MAKE) train_mec_round

 help:
 	@echo "Targets:"
@@ -92,12 +167,21 @@ help:
 	@echo "  make rl           KL-PPO fine-tune (rebuilds bc if missing)"
 	@echo "  make eval         $(EVAL_SEEDS)-seed env eval of rl"
 	@echo "  make test         pytest suite"
-	@echo "  make webots [N=$(N)] [MODE=$(MODE)]"
+	@echo "  make webots [N=$(N)] [MODE=$(MODE)] [DRIVE=$(DRIVE)] [WORLD=$(WORLD)]"
 	@echo "                    launch Webots in the chosen mode"
-	@echo "  make clean        delete bc_demos and run artefacts"
+	@echo "  WEBOTS_HEADLESS=1 make webots …   no 3D view + fast + --batch"
+	@echo "  make clean        delete artefacts for current DRIVE+WORLD"
+	@echo "  make clean_all    delete artefacts for all combinations"
+	@echo ""
+	@echo "Combinations:"
+	@echo "  make DRIVE=differential WORLD=field       diff + rectangular (default)"
+	@echo "  make DRIVE=differential WORLD=field_round  diff + circular"
+	@echo "  make DRIVE=mecanum     WORLD=field         mecanum + rectangular"
+	@echo "  make DRIVE=mecanum     WORLD=field_round   mecanum + circular"
+	@echo "  make train_all                            all 4 in sequence"
 	@echo ""
 	@echo "Hyperparameter overrides (showing defaults):"
-	@echo "  TEACHER=$(TEACHER) SEEDS_PER_N=$(SEEDS_PER_N) SUBSAMPLE=$(SUBSAMPLE) FRAME_STACK=$(FRAME_STACK)"
+	@echo "  TEACHER=$(TEACHER) SEEDS_PER_N=$(SEEDS_PER_N) SUBSAMPLE=$(SUBSAMPLE) FRAME_STACK=$(FRAME_STACK) DEMO_MAX_STEPS=$(DEMO_MAX_STEPS)"
 	@echo "  BC_EPOCHS=$(BC_EPOCHS) BC_NET_ARCH=$(BC_NET_ARCH)"
-	@echo "  PPO_STEPS=$(PPO_STEPS) KL=$(KL)"
+	@echo "  PPO_STEPS=$(PPO_STEPS) KL=$(KL) IMITATE=$(IMITATE) TIME_W=$(TIME_W) DIFFICULTY=$(DIFFICULTY)"
 	@echo "  EVAL_SEEDS=$(EVAL_SEEDS) EVAL_MAX_STEPS=$(EVAL_MAX_STEPS)"