Checkpoint 10
This commit is contained in:
@@ -41,15 +41,20 @@ TAG = $(DRIVE)_$(WORLD)
|
|||||||
BC_DEMOS = training/bc/demos_$(TAG).npz
|
BC_DEMOS = training/bc/demos_$(TAG).npz
|
||||||
BC_DIR = training/runs/bc_$(TAG)
|
BC_DIR = training/runs/bc_$(TAG)
|
||||||
RL_DIR = training/runs/rl_$(TAG)
|
RL_DIR = training/runs/rl_$(TAG)
|
||||||
|
# Stage-2 "speed pass": continue PPO from RL_DIR with TIME_W < 0 so the
|
||||||
|
# policy keeps Stage-1's success rate but cuts time-to-pen. Output is a
|
||||||
|
# separate run dir so Stage-1 stays comparable.
|
||||||
|
RL_FAST_DIR = training/runs/rl_fast_$(TAG)
|
||||||
BC_POLICY = $(BC_DIR)/policy.zip
|
BC_POLICY = $(BC_DIR)/policy.zip
|
||||||
RL_POLICY = $(RL_DIR)/policy.zip
|
RL_POLICY = $(RL_DIR)/policy.zip
|
||||||
|
RL_FAST_POLICY = $(RL_FAST_DIR)/policy.zip
|
||||||
|
|
||||||
# --- Demo collection ---
|
# --- Demo collection ---
|
||||||
TEACHER ?= universal
|
TEACHER ?= universal
|
||||||
# Round field is fundamentally harder (narrow gate at south of a circle).
|
# Round field is fundamentally harder (narrow gate at south of a circle).
|
||||||
# Default to more demos there to give BC a fair shot at 60%+.
|
# Default to more demos there to give BC a fair shot at 60%+.
|
||||||
ifeq ($(WORLD),field_round)
|
ifeq ($(WORLD),field_round)
|
||||||
SEEDS_PER_N ?= 40
|
SEEDS_PER_N ?= 60
|
||||||
else
|
else
|
||||||
SEEDS_PER_N ?= 25
|
SEEDS_PER_N ?= 25
|
||||||
endif
|
endif
|
||||||
@@ -59,7 +64,7 @@ DEMO_MAX_STEPS ?= 100000
|
|||||||
|
|
||||||
# --- Behaviour cloning ---
|
# --- Behaviour cloning ---
|
||||||
ifeq ($(WORLD),field_round)
|
ifeq ($(WORLD),field_round)
|
||||||
BC_EPOCHS ?= 100
|
BC_EPOCHS ?= 150
|
||||||
else
|
else
|
||||||
BC_EPOCHS ?= 60
|
BC_EPOCHS ?= 60
|
||||||
endif
|
endif
|
||||||
@@ -85,6 +90,13 @@ IMITATE ?= 0.0
|
|||||||
# can make RL eval worse than BC.
|
# can make RL eval worse than BC.
|
||||||
DIFFICULTY ?= 1.0
|
DIFFICULTY ?= 1.0
|
||||||
|
|
||||||
|
# --- Stage-2 "speed pass" (rl_fast) ---
# Tunables for the rl_fast run, which continues from RL_DIR with a negative
# time weight. The tighter KL is meant to hold the policy near the Stage-1
# success rate while the step count drops. All three use ?= so they can be
# overridden from the command line or the environment.

# Passed to the trainer as --total-timesteps.
RL_FAST_STEPS  ?= 1000000
# Passed to the trainer as --kl-coef.
RL_FAST_KL     ?= 0.05
# Passed to the trainer as --time-weight; negative on purpose.
RL_FAST_TIME_W ?= -0.05
|
||||||
|
|
||||||
# --- Evaluation ---
|
# --- Evaluation ---
|
||||||
EVAL_SEEDS ?= 10
|
EVAL_SEEDS ?= 10
|
||||||
EVAL_MAX_STEPS ?= 15000
|
EVAL_MAX_STEPS ?= 15000
|
||||||
@@ -94,9 +106,13 @@ N ?= 10
|
|||||||
MODE ?= rl
|
MODE ?= rl
|
||||||
|
|
||||||
|
|
||||||
.PHONY: all bc_demos bc rl eval test webots clean clean_all help \
|
.PHONY: all bc_demos bc rl rl_fast eval eval_fast eval_all eval_all_fast \
|
||||||
|
test webots clean clean_all help \
|
||||||
train_all train_diff_rect train_diff_round \
|
train_all train_diff_rect train_diff_round \
|
||||||
train_mec_rect train_mec_round
|
train_mec_rect train_mec_round \
|
||||||
|
train_all_fast train_diff_rect_fast train_diff_round_fast \
|
||||||
|
train_mec_rect_fast train_mec_round_fast \
|
||||||
|
remote_full
|
||||||
|
|
||||||
all: eval
|
all: eval
|
||||||
|
|
||||||
@@ -132,6 +148,24 @@ eval: $(RL_POLICY)
|
|||||||
--max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
|
--max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
|
||||||
--drive-mode $(DRIVE) --world $(WORLD)
|
--drive-mode $(DRIVE) --world $(WORLD)
|
||||||
|
|
||||||
|
# --- Stage-2 speed pass ---
# Second PPO stage: the Stage-1 RL run dir $(RL_DIR) is handed to the trainer
# through --bc as the starting policy, and a per-step time penalty
# ($(RL_FAST_TIME_W)) is applied so the policy keeps Stage-1's success rate
# but cuts mean steps-to-pen. Output goes to $(RL_FAST_DIR).
# Run `make rl_fast` once Stage-1 RL has converged (success ≥ teacher).
rl_fast: $(RL_FAST_POLICY)

# Rebuilt whenever the Stage-1 policy file is newer than the Stage-2 one.
$(RL_FAST_POLICY): $(RL_POLICY)
	$(PY) -m training.rl.train \
		--bc $(RL_DIR) \
		--out $(RL_FAST_DIR) \
		--total-timesteps $(RL_FAST_STEPS) \
		--kl-coef $(RL_FAST_KL) \
		--imitate-weight $(IMITATE) \
		--time-weight $(RL_FAST_TIME_W) \
		--difficulty $(DIFFICULTY) \
		--drive-mode $(DRIVE) \
		--world $(WORLD)
|
||||||
|
|
||||||
|
# Evaluate the Stage-2 (rl_fast) policy for the currently selected
# DRIVE/WORLD combo; builds the policy first if it is missing or stale.
eval_fast: $(RL_FAST_POLICY)
	$(PY) -m training.eval \
		--policy $(RL_FAST_DIR) \
		--max-flock 10 \
		--max-steps $(EVAL_MAX_STEPS) \
		--n-seeds $(EVAL_SEEDS) \
		--drive-mode $(DRIVE) \
		--world $(WORLD)
|
||||||
|
|
||||||
test:
|
test:
|
||||||
$(PY) -m pytest tests/
|
$(PY) -m pytest tests/
|
||||||
|
|
||||||
@@ -161,6 +195,65 @@ train_mec_round:
|
|||||||
|
|
||||||
train_all: train_diff_rect train_diff_round train_mec_rect train_mec_round
|
train_all: train_diff_rect train_diff_round train_mec_rect train_mec_round
|
||||||
|
|
||||||
|
# Gym eval sweep over all 4 drive/world combos. Use after train_all /
# train_all_fast. For every combo the BC run dir is evaluated first, then
# the Stage-1 RL run dir.
#
# The sweep is best-effort: a failing eval does not stop the remaining
# combos. Failures are remembered in `rc`, and the recipe exits non-zero if
# any eval failed. A plain `;`-chained loop would return only the status of
# the very last eval and hide every earlier failure.
eval_all:
	@rc=0; \
	for d in differential mecanum; do \
	  for w in field field_round; do \
	    echo ""; \
	    echo "=== BC $$d / $$w ==="; \
	    $(PY) -m training.eval --policy training/runs/bc_$${d}_$${w} \
	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
	      --drive-mode $$d --world $$w || rc=1; \
	    echo ""; \
	    echo "=== RL $$d / $$w ==="; \
	    $(PY) -m training.eval --policy training/runs/rl_$${d}_$${w} \
	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
	      --drive-mode $$d --world $$w || rc=1; \
	  done; \
	done; \
	exit $$rc
|
||||||
|
|
||||||
|
# One-shot remote runbook: clean → Stage-1 train → Stage-1 eval → Stage-2
# train → Stage-2 eval. Each step's stdout+stderr is shown live and copied by
# `tee` into its own log file in the repo root so the run is fully unattended.
#
# The recipe runs under bash with `pipefail`. Without it the status of
# `$(MAKE) ... | tee log` is tee's, so a failed stage would be logged but the
# runbook would carry on to the next stage and still print "Done". With it,
# the first failing stage stops the runbook with a non-zero exit.
remote_full: SHELL := bash
remote_full:
	$(MAKE) clean_all
	set -o pipefail; $(MAKE) train_all 2>&1 | tee stage1_train.log
	set -o pipefail; $(MAKE) eval_all 2>&1 | tee stage1_eval.log
	set -o pipefail; $(MAKE) train_all_fast 2>&1 | tee stage2_train.log
	set -o pipefail; $(MAKE) eval_all_fast 2>&1 | tee stage2_eval.log
	@echo ""
	@echo "===================================================="
	@echo " Done. Logs: stage1_train.log stage1_eval.log"
	@echo " stage2_train.log stage2_eval.log"
	@echo "===================================================="
|
||||||
|
|
||||||
|
# Gym eval sweep of the Stage-2 (rl_fast) run dirs over all 4 drive/world
# combos.
#
# Best-effort like a sweep should be: a failing eval does not stop the
# remaining combos. Failures are remembered in `rc`, and the recipe exits
# non-zero if any eval failed. A plain `;`-chained loop would return only the
# status of the very last eval and hide every earlier failure.
eval_all_fast:
	@rc=0; \
	for d in differential mecanum; do \
	  for w in field field_round; do \
	    echo ""; \
	    echo "=== RL_FAST $$d / $$w ==="; \
	    $(PY) -m training.eval --policy training/runs/rl_fast_$${d}_$${w} \
	      --max-flock 10 --max-steps $(EVAL_MAX_STEPS) --n-seeds $(EVAL_SEEDS) \
	      --drive-mode $$d --world $$w || rc=1; \
	  done; \
	done; \
	exit $$rc
|
||||||
|
|
||||||
|
# --- Stage-2 sweep ---
# One convenience target per drive/world combo. Each re-invokes make with
# that combo's DRIVE and WORLD on the command line and builds rl_fast.
# The drive/world pair is attached to each target as target-specific
# variables so the four targets can share a single recipe.
train_diff_rect_fast train_diff_round_fast: fast_drive := differential
train_mec_rect_fast train_mec_round_fast: fast_drive := mecanum
train_diff_rect_fast train_mec_rect_fast: fast_world := field
train_diff_round_fast train_mec_round_fast: fast_world := field_round

train_diff_rect_fast train_diff_round_fast train_mec_rect_fast train_mec_round_fast:
	$(MAKE) DRIVE=$(fast_drive) WORLD=$(fast_world) rl_fast

# Aggregate target: run the Stage-2 speed pass for all four combos.
train_all_fast: train_diff_rect_fast train_diff_round_fast \
                train_mec_rect_fast train_mec_round_fast
|
||||||
|
|
||||||
help:
|
help:
|
||||||
@echo "Targets:"
|
@echo "Targets:"
|
||||||
@echo " make full pipeline (bc_demos -> bc -> rl -> eval)"
|
@echo " make full pipeline (bc_demos -> bc -> rl -> eval)"
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
make[1]: Entering directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'
|
||||||
|
make DRIVE=differential WORLD=field
|
||||||
|
make[2]: Entering directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'
|
||||||
|
python -m training.eval --policy training/runs/rl_differential_field \
|
||||||
|
--max-flock 10 --max-steps 15000 --n-seeds 10 \
|
||||||
|
--drive-mode differential --world field
|
||||||
|
make[2]: Leaving directory '/run/host/home/johnnyf/Documents/Projects/TIR/project'
|
||||||
Reference in New Issue
Block a user