diff --git a/Makefile b/Makefile
index 07cc553..4dfde82 100644
--- a/Makefile
+++ b/Makefile
@@ -41,15 +41,20 @@ TAG = $(DRIVE)_$(WORLD)
 BC_DEMOS = training/bc/demos_$(TAG).npz
 BC_DIR = training/runs/bc_$(TAG)
 RL_DIR = training/runs/rl_$(TAG)
+# Stage-2 "speed pass": continue PPO from RL_DIR with TIME_W < 0 so the
+# policy keeps Stage-1's success rate but cuts time-to-pen. Output is a
+# separate run dir so Stage-1 stays comparable.
+RL_FAST_DIR = training/runs/rl_fast_$(TAG)
 BC_POLICY = $(BC_DIR)/policy.zip
 RL_POLICY = $(RL_DIR)/policy.zip
+RL_FAST_POLICY = $(RL_FAST_DIR)/policy.zip
 
 # --- Demo collection ---
 TEACHER ?= universal
 # Round field is fundamentally harder (narrow gate at south of a circle).
 # Default to more demos there to give BC a fair shot at 60%+.
 ifeq ($(WORLD),field_round)
-SEEDS_PER_N ?= 40
+SEEDS_PER_N ?= 60
 else
 SEEDS_PER_N ?= 25
 endif
@@ -59,7 +64,7 @@ DEMO_MAX_STEPS ?= 100000
 
 # --- Behaviour cloning ---
 ifeq ($(WORLD),field_round)
-BC_EPOCHS ?= 100
+BC_EPOCHS ?= 150
 else
 BC_EPOCHS ?= 60
 endif
@@ -85,6 +90,13 @@ IMITATE ?= 0.0
 # can make RL eval worse than BC.
 DIFFICULTY ?= 1.0
 
+# --- Stage-2 "speed pass" (rl_fast) ---
+# Continues from RL_DIR with a negative TIME_W. Tighter KL keeps the
+# policy near the Stage-1 success rate while step-count drops.
+RL_FAST_STEPS ?= 1000000
+RL_FAST_KL ?= 0.05
+RL_FAST_TIME_W ?= -0.05
+
 # --- Evaluation ---
 EVAL_SEEDS ?= 10
 EVAL_MAX_STEPS ?= 15000
@@ -94,9 +106,13 @@ N ?= 10
 MODE ?= rl
 
 
-.PHONY: all bc_demos bc rl eval test webots clean clean_all help \
+.PHONY: all bc_demos bc rl rl_fast eval eval_fast eval_all eval_all_fast \
+        test webots clean clean_all help \
         train_all train_diff_rect train_diff_round \
-        train_mec_rect train_mec_round
+        train_mec_rect train_mec_round \
+        train_all_fast train_diff_rect_fast train_diff_round_fast \
+        train_mec_rect_fast train_mec_round_fast \
+        remote_full
 
 all: eval
 
@@ -132,6 +148,24 @@ eval: $(RL_POLICY)
 		--max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
 		--drive-mode $(DRIVE) --world $(WORLD)
 
+# --- Stage-2 speed pass ---
+# Continues PPO from $(RL_DIR) with a per-step time penalty so the
+# policy keeps Stage-1's success rate but cuts mean steps-to-pen. Use
+# `make rl_fast` after Stage-1 RL has converged (success ≥ teacher).
+rl_fast: $(RL_FAST_POLICY)
+$(RL_FAST_POLICY): $(RL_POLICY)
+	$(PY) -m training.rl.train \
+		--bc $(RL_DIR) --out $(RL_FAST_DIR) \
+		--total-timesteps $(RL_FAST_STEPS) --kl-coef $(RL_FAST_KL) \
+		--imitate-weight $(IMITATE) --time-weight $(RL_FAST_TIME_W) \
+		--difficulty $(DIFFICULTY) \
+		--drive-mode $(DRIVE) --world $(WORLD)
+
+eval_fast: $(RL_FAST_POLICY)
+	$(PY) -m training.eval --policy $(RL_FAST_DIR) \
+		--max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
+		--drive-mode $(DRIVE) --world $(WORLD)
+
 test:
 	$(PY) -m pytest tests/
 
@@ -161,6 +195,72 @@ train_mec_round:
 
 train_all: train_diff_rect train_diff_round train_mec_rect train_mec_round
 
+# Gym eval sweep over all 4 combos. Use after train_all / train_all_fast.
+# Every combo is evaluated even if an earlier one fails; rc records any
+# failure so the target still exits non-zero and callers can stop on it.
+eval_all:
+	@rc=0; for d in differential mecanum; do \
+	  for w in field field_round; do \
+	    echo ""; \
+	    echo "=== BC $$d / $$w ==="; \
+	    $(PY) -m training.eval --policy training/runs/bc_$${d}_$${w} \
+	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
+	      --drive-mode $$d --world $$w || rc=1; \
+	    echo ""; \
+	    echo "=== RL $$d / $$w ==="; \
+	    $(PY) -m training.eval --policy training/runs/rl_$${d}_$${w} \
+	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
+	      --drive-mode $$d --world $$w || rc=1; \
+	  done; \
+	done; exit $$rc
+
+# One-shot remote runbook: clean → Stage-1 train → Stage-1 eval → Stage-2
+# train → Stage-2 eval. Each step pipes to its own log file in the repo
+# root so the run is fully unattended.
+# Each piped step runs under `bash -o pipefail`: a plain `cmd | tee` exits
+# with tee's status, which would hide a failed stage and let the runbook
+# carry on to the next one. Requires bash on the host.
+remote_full:
+	$(MAKE) clean_all
+	bash -o pipefail -c '$(MAKE) train_all 2>&1 | tee stage1_train.log'
+	bash -o pipefail -c '$(MAKE) eval_all 2>&1 | tee stage1_eval.log'
+	bash -o pipefail -c '$(MAKE) train_all_fast 2>&1 | tee stage2_train.log'
+	bash -o pipefail -c '$(MAKE) eval_all_fast 2>&1 | tee stage2_eval.log'
+	@echo ""
+	@echo "===================================================="
+	@echo " Done. Logs: stage1_train.log stage1_eval.log"
+	@echo " stage2_train.log stage2_eval.log"
+	@echo "===================================================="
+
+# Same sweep for the Stage-2 (rl_fast) policies; like eval_all, it runs
+# every combo and exits non-zero if any evaluation failed.
+eval_all_fast:
+	@rc=0; for d in differential mecanum; do \
+	  for w in field field_round; do \
+	    echo ""; \
+	    echo "=== RL_FAST $$d / $$w ==="; \
+	    $(PY) -m training.eval --policy training/runs/rl_fast_$${d}_$${w} \
+	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
+	      --drive-mode $$d --world $$w || rc=1; \
+	  done; \
+	done; exit $$rc
+
+# --- Stage-2 sweep ---
+train_diff_rect_fast:
+	$(MAKE) DRIVE=differential WORLD=field rl_fast
+
+train_diff_round_fast:
+	$(MAKE) DRIVE=differential WORLD=field_round rl_fast
+
+train_mec_rect_fast:
+	$(MAKE) DRIVE=mecanum WORLD=field rl_fast
+
+train_mec_round_fast:
+	$(MAKE) DRIVE=mecanum WORLD=field_round rl_fast
+
+train_all_fast: train_diff_rect_fast train_diff_round_fast \
+                train_mec_rect_fast train_mec_round_fast
+
 help:
 	@echo "Targets:"
 	@echo " make full pipeline (bc_demos -> bc -> rl -> eval)"
diff --git a/stage1_train.log b/stage1_train.log
new file mode 100644
index 0000000..210fc8d
--- /dev/null
+++ b/stage1_train.log
@@ -0,0 +1,7 @@
+make[1]: Entering directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'
+make DRIVE=differential WORLD=field
+make[2]: Entering directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'
+python -m training.eval --policy training/runs/rl_differential_field \
+ --max-flock 10 --max-steps 15000 --n-seeds 10 \
+ --drive-mode differential --world field
+make[2]: Leaving directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'