# Training pipeline for the shepherd-dog herding project.
# Stages chain via output files in training/.
#
# Usage:
#   make            # full pipeline: bc_demos -> bc -> rl -> eval
#   make bc_demos   # generate sim demos
#   make bc         # behaviour clone (rebuilds bc_demos if missing)
#   make rl         # KL-PPO fine-tune (rebuilds bc if missing)
#   make eval       # 10-seed env eval of rl
#   make test       # pytest suite
#   make webots N=10 MODE=rl   # launch Webots in the chosen mode
#   WEBOTS_HEADLESS=1 make webots   # no 3D view, fast mode (still needs DISPLAY or xvfb-run)
#   make clean      # delete bc_demos and run artefacts for the current DRIVE+WORLD
#   make clean_all  # delete artefacts for all combinations
#   make help       # print the target table
#
# Override any hyperparameter on the command line, for example:
#   make rl PPO_STEPS=2000000 KL=0.02
#   make eval EVAL_SEEDS=20
#
# Drive mode selects the locomotion model:
#   make DRIVE=differential       2-wheel diff-drive (default)
#   make DRIVE=mecanum             4-wheel omnidirectional
#
# World shape:
#   make WORLD=field              rectangular (default)
#   make WORLD=field_round        circular fence
#
# To train all 4 combinations:
#   make train_all


# Interpreter used by every Python recipe in this file.
PY := python

# Locomotion model and arena shape.  Each DRIVE/WORLD pair keeps its own
# demo archive and run directories, so combinations never overwrite each
# other.  Both may be overridden on the command line or from the environment.
DRIVE ?= differential
WORLD ?= field

# Everything below is derived from DRIVE and WORLD.  The assignments are
# recursive (=) so each path follows whatever DRIVE/WORLD/TAG expand to at
# the point of use.

# Suffix shared by all artefacts of one combination.
TAG = $(DRIVE)_$(WORLD)

# Demo archive consumed by behaviour cloning.
BC_DEMOS = training/bc/demos_$(TAG).npz

# One run directory per stage.  RL_FAST_DIR belongs to the Stage-2
# "speed pass": PPO continues from RL_DIR with TIME_W < 0 so the policy
# keeps Stage-1's success rate but cuts time-to-pen.  It is written to a
# separate directory so the Stage-1 run stays comparable.
BC_DIR = training/runs/bc_$(TAG)
RL_DIR = training/runs/rl_$(TAG)
RL_FAST_DIR = training/runs/rl_fast_$(TAG)

# Policy file of each stage; these are the file targets the rules chain on.
BC_POLICY = $(BC_DIR)/policy.zip
RL_POLICY = $(RL_DIR)/policy.zip
RL_FAST_POLICY = $(RL_FAST_DIR)/policy.zip

# --- Demo collection ---
# Teacher handed to training.bc.collect via --teacher.
TEACHER ?= universal

# Default --seeds-per-n, chosen per combination:
#   differential + field        25
#   differential + field_round  60   (round field is harder; more demos
#                                     give BC a fair shot at 60%+)
#   mecanum      + field        50
#   mecanum      + field_round  80
# Mecanum has more complex dynamics and a weaker teacher imitation signal
# (val_cos ≈ 0.70 vs ≥ 0.88 for differential), so it gets more demos and
# longer BC training to compensate.
ifneq ($(DRIVE),mecanum)
  ifneq ($(WORLD),field_round)
    SEEDS_PER_N ?= 25
  else
    SEEDS_PER_N ?= 60
  endif
else
  ifneq ($(WORLD),field_round)
    SEEDS_PER_N ?= 50
  else
    SEEDS_PER_N ?= 80
  endif
endif

# Remaining collect options (--subsample, --frame-stack, --max-steps).
SUBSAMPLE ?= 3
FRAME_STACK ?= 4
DEMO_MAX_STEPS ?= 100000

# --- Behaviour cloning ---
# Default --epochs, chosen per combination:
#   differential + field        60
#   differential + field_round  150
#   mecanum      + field        100
#   mecanum      + field_round  200
ifneq ($(DRIVE),mecanum)
  ifneq ($(WORLD),field_round)
    BC_EPOCHS ?= 60
  else
    BC_EPOCHS ?= 150
  endif
else
  ifneq ($(WORLD),field_round)
    BC_EPOCHS ?= 100
  else
    BC_EPOCHS ?= 200
  endif
endif

# Value passed to training.bc.pretrain as --net-arch.
BC_NET_ARCH ?= 512,512

# --- Domain randomisation (used by bc_demos and rl targets) ---
# The same three knobs are handed to demo collection and to both PPO stages.
#
# FP_RATE
#   Mean number of false-positive detections injected per step (Poisson λ).
#   Keep it identical for BC demos *and* RL.  Demo collection runs in
#   PRIVILEGED mode: the teacher sees GT while the student observation sees
#   the FP-injected tracker output, so the policy learns to denoise to the
#   GT signal.  A mismatched FP_RATE between BC and RL was the root cause
#   of an earlier regression (BC=0, RL=2 → PPO stalled at 0% success).
FP_RATE ?= 0.0

# ACTION_SMOOTH_TRAIN
#   EMA on actions, set to match the Webots controller (0.55).
ACTION_SMOOTH_TRAIN ?= 0.55

# WHEEL_SLIP_STD
#   Gaussian wheel-speed noise covering the mecanum dynamics gap.
WHEEL_SLIP_STD ?= 0.05

# --- KL-PPO fine-tune ---
# Step budget and KL coefficient depend on the world.  The round field
# gets longer training and a looser KL, with no time penalty, because
# success has to be learned before speed is rewarded.
ifneq ($(WORLD),field_round)
  PPO_STEPS ?= 2000000
  KL ?= 0.05
else
  PPO_STEPS ?= 4000000
  KL ?= 0.02
endif

# The time penalty stays at 0 until the success rate is high.  Earlier runs
# showed TIME_W=-0.05 traded ~10 pts of success for speed on hard combos,
# so Stage 1 learns to succeed and speed is optimised in a later pass.
TIME_W ?= 0.0
IMITATE ?= 0.0

# PPO rollouts run at full difficulty so the training distribution matches
# eval (deployment).  Anything lower causes a train/eval mismatch that can
# make RL eval worse than BC.
DIFFICULTY ?= 1.0

# --- Stage-2 "speed pass" (rl_fast) ---
# Stage 2 resumes from RL_DIR with a negative time weight.  A tighter KL
# keeps the policy near the Stage-1 success rate while the step count drops.
RL_FAST_STEPS ?= 1000000
RL_FAST_KL ?= 0.05

# The two drive modes respond differently.  Differential only needs a light
# touch (-0.02); stronger penalties trade success for speed without gain.
# Mecanum needs a stronger time penalty to achieve speed gains.
ifneq ($(DRIVE),mecanum)
  RL_FAST_TIME_W ?= -0.02
else
  RL_FAST_TIME_W ?= -0.05
endif

# --- Evaluation ---
# Passed to training.eval as --n-seeds and --max-steps.
EVAL_SEEDS ?= 10
EVAL_MAX_STEPS ?= 15000

# --- Webots launcher ---
# Defaults shown by `make help` for the webots target.
N ?= 10
MODE ?= rl


# Command-style targets, grouped by purpose.  Repeated .PHONY lines
# accumulate, so this declares the same set as one long list would.
# Pipeline stages and their evaluations:
.PHONY: all bc_demos bc rl rl_fast
.PHONY: eval eval_fast eval_all eval_all_fast
# Utilities:
.PHONY: test webots webots_sweep clean clean_all help remote_full
# Stage-1 sweep over the four DRIVE/WORLD combinations:
.PHONY: train_all train_diff_rect train_diff_round train_mec_rect train_mec_round
# Stage-2 sweep over the same combinations:
.PHONY: train_all_fast train_diff_rect_fast train_diff_round_fast
.PHONY: train_mec_rect_fast train_mec_round_fast

# Default goal: evaluating the RL policy pulls in every earlier stage.
all: eval

# Environment handed to every recipe.
#   HERDING_WORLD     mirrors WORLD; per the original note, geometry.py
#                     picks it up at import time.
#   PYTHONUNBUFFERED  forces Python stdout/stderr unbuffered so progress is
#                     visible live under tee / nohup / tmux pipes.
HERDING_WORLD = $(WORLD)
PYTHONUNBUFFERED = 1
export HERDING_WORLD PYTHONUNBUFFERED

# Extra collect flag for mecanum.  --use-webots-preset makes the collector
# pick up HERDING_MEC_WEBOTS, so the gym mecanum kinematics get the strafe
# efficiency and forward-bleed match against the physical-roller Webots
# proto.  Without it the policy trains on textbook X-pattern mecanum and
# fails on deployment.
# In this file only the bc_demos recipe passes the flag; the rl recipe
# carries a note that rl/train.py applies the preset itself for mecanum.
WEBOTS_PRESET_FLAG =
ifeq ($(DRIVE),mecanum)
WEBOTS_PRESET_FLAG = --use-webots-preset
endif

# Stage 0: collect teacher demonstrations into the per-combination archive.
# The file rule has no prerequisites, so an existing archive is reused as
# is; `make clean` removes it and forces a fresh collection.
bc_demos: $(BC_DEMOS)

$(BC_DEMOS):
	$(PY) -m training.bc.collect \
		--teacher $(TEACHER) \
		--out $@ \
		--seeds-per-n $(SEEDS_PER_N) \
		--subsample $(SUBSAMPLE) \
		--frame-stack $(FRAME_STACK) \
		--drive-mode $(DRIVE) \
		--world $(WORLD) \
		--max-steps $(DEMO_MAX_STEPS) \
		--fp-rate $(FP_RATE) \
		--action-smooth $(ACTION_SMOOTH_TRAIN) \
		--wheel-slip-std $(WHEEL_SLIP_STD) \
		$(WEBOTS_PRESET_FLAG)

# Stage 1a: behaviour-clone the collected demos.  The policy file depends
# on the demo archive, so missing or newer demos trigger a retrain.
bc: $(BC_POLICY)

$(BC_POLICY): $(BC_DEMOS)
	$(PY) -m training.bc.pretrain \
		--demos $< \
		--out $(BC_DIR) \
		--epochs $(BC_EPOCHS) \
		--net-arch $(BC_NET_ARCH)

# Stage 1b: KL-PPO fine-tune starting from the BC run directory.  The
# policy file depends on the BC policy, so BC is (re)built first when it
# is missing or newer.
#
# rl/train.py auto-applies HERDING_MEC_WEBOTS when drive=mecanum, so no
# --use-webots-preset flag is needed here.  This note lives outside the
# recipe on purpose: a tab-indented '#' line is a recipe line that make
# echoes and hands to the shell on every run.
rl: $(RL_POLICY)
$(RL_POLICY): $(BC_POLICY)
	$(PY) -m training.rl.train \
		--bc $(BC_DIR) --out $(RL_DIR) \
		--total-timesteps $(PPO_STEPS) --kl-coef $(KL) \
		--imitate-weight $(IMITATE) --time-weight $(TIME_W) \
		--difficulty $(DIFFICULTY) \
		--drive-mode $(DRIVE) --world $(WORLD) \
		--fp-rate $(FP_RATE) \
		--action-smooth $(ACTION_SMOOTH_TRAIN) \
		--wheel-slip-std $(WHEEL_SLIP_STD)

# Evaluate the Stage-1 RL run; trains it first when the policy is missing.
eval: $(RL_POLICY)
	$(PY) -m training.eval \
		--policy $(RL_DIR) \
		--max-flock 10 \
		--max-steps $(EVAL_MAX_STEPS) \
		--n-seeds $(EVAL_SEEDS) \
		--drive-mode $(DRIVE) \
		--world $(WORLD)

# --- Stage-2 speed pass ---
# Resumes PPO from $(RL_DIR) (passed as --bc) with a per-step time penalty,
# aiming to keep Stage-1's success rate while cutting mean steps-to-pen.
# Run `make rl_fast` once Stage-1 RL has converged (success ≥ teacher).
# Output goes to $(RL_FAST_DIR), leaving the Stage-1 run untouched.
rl_fast: $(RL_FAST_POLICY)

$(RL_FAST_POLICY): $(RL_POLICY)
	$(PY) -m training.rl.train \
		--bc $(RL_DIR) \
		--out $(RL_FAST_DIR) \
		--total-timesteps $(RL_FAST_STEPS) \
		--kl-coef $(RL_FAST_KL) \
		--imitate-weight $(IMITATE) \
		--time-weight $(RL_FAST_TIME_W) \
		--difficulty $(DIFFICULTY) \
		--drive-mode $(DRIVE) \
		--world $(WORLD) \
		--fp-rate $(FP_RATE) \
		--action-smooth $(ACTION_SMOOTH_TRAIN) \
		--wheel-slip-std $(WHEEL_SLIP_STD)

# Evaluate the Stage-2 run; trains it first when the policy is missing.
eval_fast: $(RL_FAST_POLICY)
	$(PY) -m training.eval \
		--policy $(RL_FAST_DIR) \
		--max-flock 10 \
		--max-steps $(EVAL_MAX_STEPS) \
		--n-seeds $(EVAL_SEEDS) \
		--drive-mode $(DRIVE) \
		--world $(WORLD)

# Run the pytest suite under tests/ with the configured interpreter.
test: ; $(PY) -m pytest tests/

# Hand over to the Webots launcher script (command echo suppressed).
# NOTE(review): no export of N, MODE or DRIVE is visible in this file, so
# the script only sees them when they come from the command line or the
# environment; the ?= defaults above are not passed on.  WORLD reaches the
# script as HERDING_WORLD.  Confirm the script supplies its own defaults.
webots: ; @bash tools/webots_menu.sh

# Headless sweep across all modes × worlds × flock sizes.
# Results are written to webots_sweep.log.
# Set USE_GT=1 to bypass LiDAR tracker (isolate perception from policy).
#
# USE_GT       any value other than empty or 0 exports HERDING_USE_GT=1 to
#              the script, so USE_GT=0 now really means "off".
# CONDA_PREFIX when set, its bin/ directory is put first on PATH.  When it
#              is empty PATH is left alone; prepending a bare "/bin" could
#              shadow the interpreter that is currently active.
webots_sweep:
	env $(if $(filter-out 0,$(strip $(USE_GT))),HERDING_USE_GT=1,) \
	    $(if $(strip $(CONDA_PREFIX)),PATH="$(CONDA_PREFIX)/bin:$(PATH)",) \
	    bash tools/webots_sweep.sh webots_sweep.log

# Delete every artefact of the current DRIVE+WORLD combination: the demo
# archive plus the BC, Stage-1 RL and Stage-2 (rl_fast) run directories.
# The Stage-2 directory is included so a later `make rl_fast` cannot reuse
# a policy trained from a Stage-1 run that no longer exists.
clean:
	rm -f $(BC_DEMOS)
	rm -rf $(BC_DIR) $(RL_DIR) $(RL_FAST_DIR)

# Delete artefacts of every combination.  The rl_* glob also matches the
# Stage-2 rl_fast_* run directories.
clean_all:
	rm -rf training/runs/bc_* training/runs/rl_*
	rm -f training/bc/demos_*.npz

# --- Train all 4 combinations ---
# DRIVE and WORLD select hyperparameter defaults while the makefile is
# parsed, so every combination runs in its own sub-make.  Each sub-make
# builds the default goal, i.e. the full pipeline including eval.
train_all: train_diff_rect train_diff_round train_mec_rect train_mec_round

train_diff_rect:  ; $(MAKE) DRIVE=differential WORLD=field
train_diff_round: ; $(MAKE) DRIVE=differential WORLD=field_round
train_mec_rect:   ; $(MAKE) DRIVE=mecanum WORLD=field
train_mec_round:  ; $(MAKE) DRIVE=mecanum WORLD=field_round

# Gym eval sweep over all 4 combos. Use after train_all / train_all_fast.
# For each drive:world pair the BC run is evaluated first, then the RL run.
# The loop keeps going after a failed eval; its exit status is that of the
# last eval executed.
eval_all:
	@for combo in differential:field differential:field_round \
	              mecanum:field mecanum:field_round; do \
	  d="$${combo%%:*}"; \
	  w="$${combo#*:}"; \
	  for stage in "BC:bc" "RL:rl"; do \
	    echo ""; \
	    echo "=== $${stage%%:*}  $$d / $$w ==="; \
	    $(PY) -m training.eval \
	      --policy "training/runs/$${stage#*:}_$${d}_$${w}" \
	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
	      --drive-mode "$$d" --world "$$w"; \
	  done; \
	done

# One-shot remote runbook: clean → Stage-1 train → Stage-1 eval → Stage-2
# train → Stage-2 eval. Each step pipes to its own log file in the repo
# root so the run is fully unattended.
#
# The recipe runs under bash with pipefail.  Without it every
# `$(MAKE) … | tee` line reports tee's exit status, so a failed stage
# would go unnoticed and the following stages would run on missing or
# stale policies.  With pipefail the runbook stops at the first failing
# stage; the log written so far is kept.
remote_full: SHELL := bash
remote_full:
	$(MAKE) clean_all
	set -o pipefail; $(MAKE) train_all 2>&1 | tee stage1_train.log
	set -o pipefail; $(MAKE) eval_all 2>&1 | tee stage1_eval.log
	set -o pipefail; $(MAKE) train_all_fast 2>&1 | tee stage2_train.log
	set -o pipefail; $(MAKE) eval_all_fast 2>&1 | tee stage2_eval.log
	@echo ""
	@echo "===================================================="
	@echo "  Done. Logs: stage1_train.log stage1_eval.log"
	@echo "              stage2_train.log stage2_eval.log"
	@echo "===================================================="

# Gym eval sweep over the Stage-2 (rl_fast) runs of all 4 combinations.
# The loop keeps going after a failed eval; its exit status is that of the
# last eval executed.
eval_all_fast:
	@for combo in differential:field differential:field_round \
	              mecanum:field mecanum:field_round; do \
	  d="$${combo%%:*}"; \
	  w="$${combo#*:}"; \
	  echo ""; \
	  echo "=== RL_FAST  $$d / $$w ==="; \
	  $(PY) -m training.eval \
	    --policy "training/runs/rl_fast_$${d}_$${w}" \
	    --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
	    --drive-mode "$$d" --world "$$w"; \
	done

# --- Stage-2 sweep ---
# One sub-make per combination, each building only the rl_fast goal.  The
# rl_fast rule depends on the Stage-1 policy, so a missing Stage-1 run is
# trained first.
train_all_fast: train_diff_rect_fast train_diff_round_fast \
                train_mec_rect_fast train_mec_round_fast

train_diff_rect_fast:  ; $(MAKE) DRIVE=differential WORLD=field rl_fast
train_diff_round_fast: ; $(MAKE) DRIVE=differential WORLD=field_round rl_fast
train_mec_rect_fast:   ; $(MAKE) DRIVE=mecanum WORLD=field rl_fast
train_mec_round_fast:  ; $(MAKE) DRIVE=mecanum WORLD=field_round rl_fast

# Print the target table, the four DRIVE/WORLD combinations and the
# overridable hyperparameters.  The values shown are whatever the variables
# expand to for the current invocation, so `make help DRIVE=mecanum` lists
# the mecanum defaults.  The table covers the Stage-2, sweep and runbook
# targets as well as the Stage-1 pipeline.
help:
	@echo "Targets:"
	@echo "  make              full pipeline (bc_demos -> bc -> rl -> eval)"
	@echo "  make bc_demos     sim demos via the '$(TEACHER)' teacher"
	@echo "  make bc           train BC (rebuilds bc_demos if missing)"
	@echo "  make rl           KL-PPO fine-tune (rebuilds bc if missing)"
	@echo "  make rl_fast      Stage-2 speed pass (rebuilds rl if missing)"
	@echo "  make eval         $(EVAL_SEEDS)-seed env eval of rl"
	@echo "  make eval_fast    $(EVAL_SEEDS)-seed env eval of rl_fast"
	@echo "  make eval_all     eval bc and rl runs of all 4 combinations"
	@echo "  make eval_all_fast"
	@echo "                    eval rl_fast runs of all 4 combinations"
	@echo "  make test         pytest suite"
	@echo "  make webots [N=$(N)] [MODE=$(MODE)] [DRIVE=$(DRIVE)] [WORLD=$(WORLD)]"
	@echo "                    launch Webots in the chosen mode"
	@echo "  WEBOTS_HEADLESS=1 make webots …   no 3D view + fast + --batch"
	@echo "  make webots_sweep [USE_GT=1]"
	@echo "                    headless Webots sweep, results in webots_sweep.log"
	@echo "  make clean        delete artefacts for current DRIVE+WORLD"
	@echo "  make clean_all    delete artefacts for all combinations"
	@echo "  make remote_full  clean_all, then Stage-1 and Stage-2 train + eval with logs"
	@echo ""
	@echo "Combinations:"
	@echo "  make DRIVE=differential WORLD=field       diff + rectangular (default)"
	@echo "  make DRIVE=differential WORLD=field_round  diff + circular"
	@echo "  make DRIVE=mecanum     WORLD=field         mecanum + rectangular"
	@echo "  make DRIVE=mecanum     WORLD=field_round   mecanum + circular"
	@echo "  make train_all                            all 4 in sequence"
	@echo "  make train_all_fast                       Stage-2 speed pass for all 4"
	@echo ""
	@echo "Hyperparameter overrides (showing defaults):"
	@echo "  TEACHER=$(TEACHER) SEEDS_PER_N=$(SEEDS_PER_N) SUBSAMPLE=$(SUBSAMPLE) FRAME_STACK=$(FRAME_STACK) DEMO_MAX_STEPS=$(DEMO_MAX_STEPS)"
	@echo "  BC_EPOCHS=$(BC_EPOCHS) BC_NET_ARCH=$(BC_NET_ARCH)"
	@echo "  FP_RATE=$(FP_RATE) ACTION_SMOOTH_TRAIN=$(ACTION_SMOOTH_TRAIN) WHEEL_SLIP_STD=$(WHEEL_SLIP_STD)"
	@echo "  PPO_STEPS=$(PPO_STEPS) KL=$(KL) IMITATE=$(IMITATE) TIME_W=$(TIME_W) DIFFICULTY=$(DIFFICULTY)"
	@echo "  RL_FAST_STEPS=$(RL_FAST_STEPS) RL_FAST_KL=$(RL_FAST_KL) RL_FAST_TIME_W=$(RL_FAST_TIME_W)"
	@echo "  EVAL_SEEDS=$(EVAL_SEEDS) EVAL_MAX_STEPS=$(EVAL_MAX_STEPS)"