Checkpoint 9

2026-05-13 13:46:50 +01:00
parent be58ad2054
commit 683de740af
2 changed files with 24 additions and 9 deletions
@@ -71,12 +71,14 @@ BC_NET_ARCH      ?= 512,512
 ifeq ($(WORLD),field_round)
 PPO_STEPS        ?= 4000000
 KL               ?= 0.02
-TIME_W           ?= 0.0
 else
 PPO_STEPS        ?= 2000000
 KL               ?= 0.05
-TIME_W           ?= -0.05
 endif
+# Time penalty defaults to 0 for every WORLD until success rate is high.
+# Earlier runs showed TIME_W=-0.05 traded ~10 pts of success for speed on
+# hard combos — learn to succeed first, optimize speed in a later pass.
+TIME_W           ?= 0.0
 IMITATE          ?= 0.0
 # PPO rollouts at full difficulty so the training distribution matches
 # eval (deployment).  Anything lower causes a train/eval mismatch that