Checkpoint 10

2026-05-13 23:14:16 +01:00
parent 0f807003a5
commit aa598fcb83
2 changed files with 104 additions and 4 deletions
@@ -41,15 +41,20 @@ TAG               = $(DRIVE)_$(WORLD)
 # Per-configuration artefact paths. Each one embeds $(TAG), so every
 # DRIVE/WORLD combination gets its own demo file and run directories.
 BC_DEMOS          = training/bc/demos_$(TAG).npz
 BC_DIR            = training/runs/bc_$(TAG)
 RL_DIR            = training/runs/rl_$(TAG)
 # Stage-2 "speed pass": continue PPO from RL_DIR with TIME_W < 0 so the
 # policy keeps Stage-1's success rate but cuts time-to-pen.  Output is a
 # separate run dir so Stage-1 stays comparable.
 RL_FAST_DIR       = training/runs/rl_fast_$(TAG)
 # policy.zip inside each run directory is the file make tracks as a
 # target/prerequisite (e.g. $(RL_FAST_POLICY) is rebuilt from $(RL_POLICY)).
 BC_POLICY         = $(BC_DIR)/policy.zip
 RL_POLICY         = $(RL_DIR)/policy.zip
 RL_FAST_POLICY    = $(RL_FAST_DIR)/policy.zip
 # --- Demo collection ---
 # `?=` only supplies a default: a value given on the command line or in
 # the environment takes precedence.
 TEACHER          ?= universal
 # Round field is fundamentally harder (narrow gate at south of a circle).
 # Default to more demos there to give BC a fair shot at 60%+.
 # The conditional is evaluated while the makefile is parsed, so it sees
 # the WORLD value in effect at that point.
 ifeq ($(WORLD),field_round)
-SEEDS_PER_N      ?= 40
+SEEDS_PER_N      ?= 60
 else
 SEEDS_PER_N      ?= 25
 endif
@@ -59,7 +64,7 @@ DEMO_MAX_STEPS   ?= 100000
 # --- Behaviour cloning ---
 # The round field trains for more epochs by default than any other WORLD;
 # either default can be overridden with BC_EPOCHS=<n> on the command line.
 ifeq ($(WORLD),field_round)
-BC_EPOCHS        ?= 100
+BC_EPOCHS        ?= 150
 else
 BC_EPOCHS        ?= 60
 endif
@@ -85,6 +90,13 @@ IMITATE          ?= 0.0
 # can make RL eval worse than BC.
 DIFFICULTY       ?= 1.0
 # --- Stage-2 "speed pass" (rl_fast) ---
 # Continues from RL_DIR with a negative TIME_W. Tighter KL keeps the
 # policy near the Stage-1 success rate while step-count drops.
 # All three knobs are forwarded to training.rl.train by the rl_fast rule:
 #   RL_FAST_STEPS   -> --total-timesteps
 #   RL_FAST_KL      -> --kl-coef
 #   RL_FAST_TIME_W  -> --time-weight
 RL_FAST_STEPS    ?= 1000000
 RL_FAST_KL       ?= 0.05
 RL_FAST_TIME_W   ?= -0.05
 # --- Evaluation ---
 # Forwarded to training.eval as --n-seeds and --max-steps respectively.
 EVAL_SEEDS       ?= 10
 EVAL_MAX_STEPS   ?= 15000
@@ -94,9 +106,13 @@ N                ?= 10
 MODE             ?= rl
 # Targets listed here are command names, not files: make runs them
 # unconditionally even if a file with the same name exists.
-.PHONY: all bc_demos bc rl eval test webots clean clean_all help \
+.PHONY: all bc_demos bc rl rl_fast eval eval_fast eval_all eval_all_fast \
        test webots clean clean_all help \
        train_all train_diff_rect train_diff_round \
-        train_mec_rect train_mec_round
+        train_mec_rect train_mec_round \
        train_all_fast train_diff_rect_fast train_diff_round_fast \
        train_mec_rect_fast train_mec_round_fast \
        remote_full
 # `all` has no recipe of its own; it simply requires the `eval` target.
 all: eval
@@ -132,6 +148,24 @@ eval: $(RL_POLICY)
 		--max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
 		--drive-mode $(DRIVE) --world $(WORLD)
 # --- Stage-2 speed pass ---
 # Second PPO stage: start from the run in $(RL_DIR) and train with a
 # per-step time penalty, so the Stage-1 success rate is kept while the
 # mean steps-to-pen shrinks. Run `make rl_fast` once Stage-1 RL has
 # converged (success ≥ teacher).
 # `rl_fast` is the convenience name; the real file target is the Stage-2
 # policy, which is rebuilt whenever the Stage-1 policy is newer.
 rl_fast: $(RL_FAST_POLICY)
 $(RL_FAST_POLICY): $(RL_POLICY)
 	$(PY) -m training.rl.train \
 		--bc $(RL_DIR) \
 		--out $(RL_FAST_DIR) \
 		--total-timesteps $(RL_FAST_STEPS) \
 		--kl-coef $(RL_FAST_KL) \
 		--imitate-weight $(IMITATE) \
 		--time-weight $(RL_FAST_TIME_W) \
 		--difficulty $(DIFFICULTY) \
 		--drive-mode $(DRIVE) \
 		--world $(WORLD)
 # Evaluate the Stage-2 policy of the current DRIVE/WORLD configuration;
 # the policy is (re)built first if it is missing or out of date.
 eval_fast: $(RL_FAST_POLICY)
 	$(PY) -m training.eval \
 		--policy $(RL_FAST_DIR) \
 		--max-flock 10 \
 		--max-steps $(EVAL_MAX_STEPS) \
 		--n-seeds $(EVAL_SEEDS) \
 		--drive-mode $(DRIVE) \
 		--world $(WORLD)
 # Run the pytest suite located under tests/.
 test:
 	$(PY) -m pytest tests/
@@ -161,6 +195,65 @@ train_mec_round:
 # Aggregate target: requires all four per-configuration train_* targets.
 train_all: train_diff_rect train_diff_round train_mec_rect train_mec_round
 # Gym eval sweep over all 4 combos. Use after train_all / train_all_fast.
 # Evaluates the BC run and the RL run of every drive/world pair. A failed
 # evaluation does not stop the sweep (the remaining combos still run), but
 # it is recorded in `rc` so the target exits non-zero. Previously the
 # commands were chained with `;`, so only the status of the very last
 # evaluation was ever reported and earlier failures went unnoticed.
 eval_all:
 	@rc=0; \
 	for d in differential mecanum; do \
 	  for w in field field_round; do \
 	    echo ""; \
 	    echo "=== BC  $$d / $$w ==="; \
 	    $(PY) -m training.eval --policy training/runs/bc_$${d}_$${w} \
 	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
 	      --drive-mode $$d --world $$w || rc=1; \
 	    echo ""; \
 	    echo "=== RL  $$d / $$w ==="; \
 	    $(PY) -m training.eval --policy training/runs/rl_$${d}_$${w} \
 	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
 	      --drive-mode $$d --world $$w || rc=1; \
 	  done; \
 	done; \
 	exit $$rc
 # One-shot remote runbook: clean → Stage-1 train → Stage-1 eval → Stage-2
 # train → Stage-2 eval. Each step is tee'd to its own log file in the repo
 # root so the run is fully unattended.
 # A plain `cmd | tee log` reports tee's exit status, so a failed stage
 # would be ignored and the next stage would start anyway. Each pipeline
 # therefore runs under `bash -o pipefail` (bash must be on PATH), which
 # lets the stage's own failure abort the runbook. The literal $(MAKE) in
 # each line keeps make treating it as a recursive invocation.
 remote_full:
 	$(MAKE) clean_all
 	bash -o pipefail -c '$(MAKE) train_all 2>&1 | tee stage1_train.log'
 	bash -o pipefail -c '$(MAKE) eval_all 2>&1 | tee stage1_eval.log'
 	bash -o pipefail -c '$(MAKE) train_all_fast 2>&1 | tee stage2_train.log'
 	bash -o pipefail -c '$(MAKE) eval_all_fast 2>&1 | tee stage2_eval.log'
 	@echo ""
 	@echo "===================================================="
 	@echo "  Done. Logs: stage1_train.log stage1_eval.log"
 	@echo "              stage2_train.log stage2_eval.log"
 	@echo "===================================================="
 # Gym eval sweep of the Stage-2 (rl_fast) run for every drive/world pair.
 # A failed evaluation does not stop the sweep, but it is recorded in `rc`
 # so the target exits non-zero. Previously only the status of the last
 # evaluation was reported and earlier failures went unnoticed.
 eval_all_fast:
 	@rc=0; \
 	for d in differential mecanum; do \
 	  for w in field field_round; do \
 	    echo ""; \
 	    echo "=== RL_FAST  $$d / $$w ==="; \
 	    $(PY) -m training.eval --policy training/runs/rl_fast_$${d}_$${w} \
 	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
 	      --drive-mode $$d --world $$w || rc=1; \
 	  done; \
 	done; \
 	exit $$rc
 # --- Stage-2 sweep ---
 # One wrapper per drive/world pair. Each re-invokes make with DRIVE and
 # WORLD pinned on the command line and asks for the rl_fast goal, so the
 # sub-make resolves that configuration's paths.
 train_diff_rect_fast:
 	$(MAKE) rl_fast DRIVE=differential WORLD=field
 train_diff_round_fast:
 	$(MAKE) rl_fast DRIVE=differential WORLD=field_round
 train_mec_rect_fast:
 	$(MAKE) rl_fast DRIVE=mecanum WORLD=field
 train_mec_round_fast:
 	$(MAKE) rl_fast DRIVE=mecanum WORLD=field_round
 # Aggregate target: requires the speed pass of all four configurations.
 train_all_fast: train_diff_rect_fast train_diff_round_fast train_mec_rect_fast train_mec_round_fast
 help:
 	@echo "Targets:"
 	@echo "  make              full pipeline (bc_demos -> bc -> rl -> eval)"
@@ -0,0 +1,7 @@
 make[1]: Entering directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'
 make DRIVE=differential WORLD=field
 make[2]: Entering directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'
 python -m training.eval --policy training/runs/rl_differential_field \
 	--max-flock 10 --max-steps 15000 --n-seeds 10 \
 	--drive-mode differential --world field
 make[2]: Leaving directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'