# Training pipeline for the shepherd-dog herding project.
# Stages chain via output files in training/.
#
# Usage:
#   make                           # full pipeline: bc_demos -> bc -> rl -> eval
#   make bc_demos                  # generate sim demos
#   make bc                        # behaviour clone (rebuilds bc_demos if missing)
#   make rl                        # KL-PPO fine-tune (rebuilds bc if missing)
#   make eval                      # 10-seed env eval of rl
#   make test                      # pytest suite
#   make webots N=10 MODE=rl       # launch Webots in the chosen mode
#   WEBOTS_HEADLESS=1 make webots  # no 3D view, fast mode (still needs DISPLAY or xvfb-run)
#   make clean                     # delete bc_demos and run artefacts for the current DRIVE+WORLD
#   make clean_all                 # delete artefacts for all combinations
#   make help                      # print the target table
#
# Override any hyperparameter on the command line, for example:
#   make rl PPO_STEPS=2000000 KL=0.02
#   make eval EVAL_SEEDS=20
#
# Drive mode selects the locomotion model:
#   make DRIVE=differential        2-wheel diff-drive (default)
#   make DRIVE=mecanum             4-wheel omnidirectional
#
# World shape:
#   make WORLD=field               rectangular (default)
#   make WORLD=field_round         circular fence
#
# To train all 4 combinations:
#   make train_all

# Python interpreter used by every stage. Declared with `?=` like the other
# knobs so it can be chosen from the command line or the environment,
# e.g. `make PY=python3` or `PY=python3 make`.
PY ?= python

# Drive mode and world shape — each combination gets its own artefacts.
DRIVE ?= differential
WORLD ?= field

# Derived tag and paths. DRIVE and WORLD are already final at this point
# (defaults above, or a command-line/environment override), so these are
# expanded once with `:=`.
TAG       := $(DRIVE)_$(WORLD)
BC_DEMOS  := training/bc/demos_$(TAG).npz
BC_DIR    := training/runs/bc_$(TAG)
RL_DIR    := training/runs/rl_$(TAG)
BC_POLICY := $(BC_DIR)/policy.zip
RL_POLICY := $(RL_DIR)/policy.zip

# --- Demo collection ---
TEACHER ?= universal
# Round field is fundamentally harder (narrow gate at south of a circle).
# Default to more demos there to give BC a fair shot at 60%+.
ifeq ($(WORLD),field_round)
SEEDS_PER_N ?= 40
endif
# Every other world. `?=` only assigns when the variable is still unset, so
# the round-field value above (or a user override) is left untouched.
SEEDS_PER_N    ?= 25
SUBSAMPLE      ?= 3
FRAME_STACK    ?= 4
DEMO_MAX_STEPS ?= 100000

# --- Behaviour cloning ---
# Round-field override first, general default after (same `?=` pattern).
ifeq ($(WORLD),field_round)
BC_EPOCHS ?= 100
endif
BC_EPOCHS   ?= 60
BC_NET_ARCH ?= 512,512

# --- KL-PPO fine-tune ---
# Round field: longer training, looser KL, no time penalty (success
# must be learned before speed is rewarded).
ifeq ($(WORLD),field_round)
PPO_STEPS ?= 4000000
KL        ?= 0.02
TIME_W    ?= 0.0
endif
# Defaults for every other world; no-ops when already set above.
PPO_STEPS ?= 2000000
KL        ?= 0.05
TIME_W    ?= -0.05
IMITATE   ?= 0.0

# PPO rollouts run at full difficulty so that the training distribution
# matches eval (deployment). Anything lower causes a train/eval mismatch
# that can make RL eval worse than BC.
DIFFICULTY ?= 1.0

# --- Evaluation ---
EVAL_SEEDS     ?= 10
EVAL_MAX_STEPS ?= 15000

# --- Webots launcher ---
N    ?= 10
MODE ?= rl

# Command-style targets (none of them names a file on disk).
.PHONY: all bc_demos bc rl eval test webots clean clean_all help
.PHONY: train_all train_diff_rect train_diff_round train_mec_rect train_mec_round

# Default goal: run the whole pipeline through evaluation.
all: eval

# Export HERDING_WORLD so that geometry.py picks it up at import time.
export HERDING_WORLD = $(WORLD)

# --- Stage 1: demo collection ---
# `bc_demos` is the phony alias; the real target is the demo archive.
bc_demos: $(BC_DEMOS)

# No prerequisites: the archive is only (re)generated when it is missing.
$(BC_DEMOS):
	$(PY) -m training.bc.collect \
		--teacher $(TEACHER) --out $@ \
		--seeds-per-n $(SEEDS_PER_N) --subsample $(SUBSAMPLE) \
		--frame-stack $(FRAME_STACK) --drive-mode $(DRIVE) \
		--world $(WORLD) \
		--max-steps $(DEMO_MAX_STEPS)

# --- Stage 2: behaviour cloning ---
bc: $(BC_POLICY)

# $< is the demo archive; the trainer is given the run directory as --out.
$(BC_POLICY): $(BC_DEMOS)
	$(PY) -m training.bc.pretrain \
		--demos $< --out $(BC_DIR) \
		--epochs $(BC_EPOCHS) --net-arch $(BC_NET_ARCH)

# --- Stage 3: KL-PPO fine-tune ---
rl: $(RL_POLICY)

# Starts from the BC run directory and writes into the RL run directory.
$(RL_POLICY): $(BC_POLICY)
	$(PY) -m training.rl.train \
		--bc $(BC_DIR) --out $(RL_DIR) \
		--total-timesteps $(PPO_STEPS) --kl-coef $(KL) \
		--imitate-weight $(IMITATE) --time-weight $(TIME_W) \
		--difficulty $(DIFFICULTY) \
		--drive-mode $(DRIVE) --world $(WORLD)

# --- Stage 4: evaluation ---
# Value passed to the evaluator's --max-flock option (previously hard-coded
# to 10); overridable like the other knobs, e.g. `make eval EVAL_MAX_FLOCK=20`.
EVAL_MAX_FLOCK ?= 10

# Phony, so the evaluation runs on every invocation once the RL policy exists.
eval: $(RL_POLICY)
	$(PY) -m training.eval --policy $(RL_DIR) \
		--max-flock $(EVAL_MAX_FLOCK) --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
		--drive-mode $(DRIVE) --world $(WORLD)

# --- Utilities ---
test:
	$(PY) -m pytest tests/

webots:
	tools/run_webots.sh $(N) $(MODE) $(DRIVE) $(WORLD)

# Removes only the artefacts of the currently selected DRIVE+WORLD.
clean:
	rm -f $(BC_DEMOS)
	rm -rf $(BC_DIR) $(RL_DIR)

# Removes the artefacts of every DRIVE+WORLD combination.
clean_all:
	rm -f training/bc/demos_*.npz
	rm -rf training/runs/bc_* training/runs/rl_*

# --- Train all 4 combinations ---
# Each target re-invokes this Makefile with a fixed DRIVE/WORLD pair and
# builds its default goal. $(MAKE) is used so flags such as -n and -j are
# passed on to the sub-make.
train_diff_rect:
	$(MAKE) DRIVE=differential WORLD=field

train_diff_round:
	$(MAKE) DRIVE=differential WORLD=field_round

train_mec_rect:
	$(MAKE) DRIVE=mecanum WORLD=field

train_mec_round:
	$(MAKE) DRIVE=mecanum WORLD=field_round

# In a serial make the four prerequisites run one after another in the order
# listed; with `make -j` they may run concurrently.
train_all: train_diff_rect train_diff_round train_mec_rect train_mec_round

# Prints the target table and the current value of every overridable knob.
help:
	@echo "Targets:"
	@echo "  make                  full pipeline (bc_demos -> bc -> rl -> eval)"
	@echo "  make bc_demos         sim demos via the '$(TEACHER)' teacher"
	@echo "  make bc               train BC (rebuilds bc_demos if missing)"
	@echo "  make rl               KL-PPO fine-tune (rebuilds bc if missing)"
	@echo "  make eval             $(EVAL_SEEDS)-seed env eval of rl"
	@echo "  make test             pytest suite"
	@echo "  make webots [N=$(N)] [MODE=$(MODE)] [DRIVE=$(DRIVE)] [WORLD=$(WORLD)]"
	@echo "                        launch Webots in the chosen mode"
	@echo "  WEBOTS_HEADLESS=1 make webots …  no 3D view + fast + --batch"
	@echo "  make clean            delete artefacts for current DRIVE+WORLD"
	@echo "  make clean_all        delete artefacts for all combinations"
	@echo "  make help             print this table"
	@echo ""
	@echo "Combinations:"
	@echo "  make DRIVE=differential WORLD=field        diff + rectangular (default)"
	@echo "  make DRIVE=differential WORLD=field_round  diff + circular"
	@echo "  make DRIVE=mecanum WORLD=field             mecanum + rectangular"
	@echo "  make DRIVE=mecanum WORLD=field_round       mecanum + circular"
	@echo "  make train_all                             all 4 in sequence"
	@echo ""
	@echo "Hyperparameter overrides (showing defaults):"
	@echo "  TEACHER=$(TEACHER) SEEDS_PER_N=$(SEEDS_PER_N) SUBSAMPLE=$(SUBSAMPLE) FRAME_STACK=$(FRAME_STACK) DEMO_MAX_STEPS=$(DEMO_MAX_STEPS)"
	@echo "  BC_EPOCHS=$(BC_EPOCHS) BC_NET_ARCH=$(BC_NET_ARCH)"
	@echo "  PPO_STEPS=$(PPO_STEPS) KL=$(KL) IMITATE=$(IMITATE) TIME_W=$(TIME_W) DIFFICULTY=$(DIFFICULTY)"
	@echo "  EVAL_SEEDS=$(EVAL_SEEDS) EVAL_MAX_STEPS=$(EVAL_MAX_STEPS) EVAL_MAX_FLOCK=$(EVAL_MAX_FLOCK)"