Checkpoint 9
This commit is contained in:
@@ -71,12 +71,14 @@ BC_NET_ARCH ?= 512,512
|
||||
ifeq ($(WORLD),field_round)
|
||||
PPO_STEPS ?= 4000000
|
||||
KL ?= 0.02
|
||||
TIME_W ?= 0.0
|
||||
else
|
||||
PPO_STEPS ?= 2000000
|
||||
KL ?= 0.05
|
||||
TIME_W ?= -0.05
|
||||
endif
|
||||
# Time penalty is 0 until success rate is high. Earlier runs showed
|
||||
# TIME_W=-0.05 traded ~10 pts of success for speed on hard combos —
|
||||
# learn to succeed first, optimize speed in a later pass.
|
||||
TIME_W ?= 0.0
|
||||
IMITATE ?= 0.0
|
||||
# PPO rollouts at full difficulty so the training distribution matches
|
||||
# eval (deployment). Anything lower causes a train/eval mismatch that
|
||||
|
||||
Reference in New Issue
Block a user