---
# PPO hyperparameters for the herding env. Tuned for a 28-D obs / 2-D
# continuous action space with 16 parallel envs on GPU. These are SB3
# defaults nudged toward longer credit assignment (gamma=0.995) and a
# slightly higher entropy bonus to keep exploration alive while the
# curriculum expands the flock size.
#
# NOTE: large integers are written without `_` digit separators. YAML 1.1
# loaders accept `10_000_000` as an int, but YAML 1.2 core-schema loaders
# read it as a string; plain decimals load as ints under both.

# --- PPO ---
learning_rate: 3.0e-4
# Rollout length per env before each update
# (2048 steps x 16 envs = 32768 transitions per rollout).
n_steps: 2048
batch_size: 256
n_epochs: 10
gamma: 0.995
gae_lambda: 0.95
clip_range: 0.2
ent_coef: 0.05  # was 0.01 — earlier runs collapsed to ~0 actions
vf_coef: 0.5
max_grad_norm: 0.5
target_kl: null  # disable early-stop on KL

# --- Network ---
policy: MlpPolicy
net_arch_pi: [128, 128]
net_arch_vf: [128, 128]
# Initial std = exp(0.5) ≈ 1.65 instead of the default 1.0 — more exploration.
log_std_init: 0.5

# --- Training schedule ---
total_timesteps: 10000000  # 10M env steps
n_envs: 16
checkpoint_freq: 500000  # 500k, in env steps
eval_freq: 100000  # 100k, in env steps
n_eval_episodes: 20

# --- Curriculum (max-n_sheep schedule, in env steps) ---
# Each entry: at step s, raise the env's max_n_sheep to k. The env samples
# uniformly from [1, max_n_sheep] each reset, so this widens the
# distribution gradually rather than swapping fixed sizes.
#
# State-space curriculum: difficulty controls the sheep spawn area
# (0 = sheep spawn just north of gate, 1 = sheep spawn anywhere in field).
# It is scheduled alongside the existing flock-size curriculum.
#
# The two together let the policy first learn "what penning looks like"
# in a regime where random exploration reliably triggers it, then
# gradually generalise to the deployment distribution.
curriculum:
  - step: 0
    max_n_sheep: 1
    difficulty: 0.0
  - step: 1000000  # 1M
    max_n_sheep: 1
    difficulty: 0.3
  - step: 2000000  # 2M
    max_n_sheep: 2
    difficulty: 0.5
  - step: 4000000  # 4M
    max_n_sheep: 3
    difficulty: 0.8
  - step: 6000000  # 6M
    max_n_sheep: 5
    difficulty: 1.0
  - step: 8000000  # 8M
    max_n_sheep: 8
    difficulty: 1.0
  - step: 9000000  # 9M
    max_n_sheep: 10
    difficulty: 1.0