Checkpoint 2

2026-05-07 22:00:10 +01:00
parent 90aa3bbcb4
commit 1bb9415414
37 changed files with 3068 additions and 2912 deletions
@@ -0,0 +1,52 @@
+# PPO hyperparameters for the herding env. Tuned for a 28-D observation /
+# 2-D continuous action space with 16 parallel envs on GPU. Mostly SB3
+# defaults: gamma is raised to 0.995 for longer credit assignment, and
+# ent_coef / log_std_init are raised to keep exploration alive while the
+# curriculum expands the flock size.
+
+# --- PPO ---
+learning_rate: 3.0e-4      # keep the "3.0": PyYAML parses a bare 3e-4 as a string
+n_steps: 2048              # rollout length per env before each update
+batch_size: 256            # minibatch size; divides n_steps * n_envs (32768) evenly
+n_epochs: 10
+gamma: 0.995               # above SB3's 0.99 default, for longer credit assignment
+gae_lambda: 0.95
+clip_range: 0.2
+ent_coef: 0.05             # was 0.01 — earlier runs collapsed to ~0 actions
+vf_coef: 0.5
+max_grad_norm: 0.5
+target_kl: null            # null disables early-stop on KL
+
+# --- Network ---
+policy: MlpPolicy
+net_arch_pi: [128, 128]    # policy hidden layers — presumably SB3 net_arch "pi"; confirm
+net_arch_vf: [128, 128]    # value hidden layers — presumably SB3 net_arch "vf"; confirm
+log_std_init: 0.5          # initial std = exp(0.5) ≈ 1.65 vs default 1.0 — more exploration
+
+# --- Training schedule ---
+total_timesteps: 10_000_000  # "_" separators rely on YAML 1.1 int rules (PyYAML accepts them)
+n_envs: 16                 # parallel envs
+checkpoint_freq: 500_000   # in env steps — TODO confirm trainer divides by n_envs for SB3 callbacks
+eval_freq: 100_000         # in env steps — same n_envs caveat as checkpoint_freq
+n_eval_episodes: 20
+
+# --- Curriculum (flock size + spawn difficulty, keyed by env step) ---
+# Each entry: from env step `step` onward, set the env's max_n_sheep and
+# difficulty to the given values. The env samples the flock size
+# uniformly from [1, max_n_sheep] each reset, so raising max_n_sheep widens
+# the distribution gradually rather than swapping fixed sizes.
+#
+# difficulty is the state-space curriculum: it controls the sheep spawn
+# area (0 = sheep spawn just north of gate, 1 = anywhere in field).
+#
+# The two together let the policy first learn "what penning looks like"
+# in a regime where random exploration reliably triggers it, then
+# gradually generalise to the deployment distribution.
+curriculum:  # stages are listed in ascending `step` order
+  - { step: 0,          max_n_sheep: 1, difficulty: 0.0 }   # one sheep, spawned just north of the gate
+  - { step: 1_000_000,  max_n_sheep: 1, difficulty: 0.3 }   # still one sheep; only difficulty rises
+  - { step: 2_000_000,  max_n_sheep: 2, difficulty: 0.5 }
+  - { step: 4_000_000,  max_n_sheep: 3, difficulty: 0.8 }
+  - { step: 6_000_000,  max_n_sheep: 5, difficulty: 1.0 }   # difficulty 1.0: spawn anywhere in the field
+  - { step: 8_000_000,  max_n_sheep: 8, difficulty: 1.0 }
+  - { step: 9_000_000,  max_n_sheep: 10, difficulty: 1.0 }  # final stage: flock sizes 1-10