# PPO hyperparameters for the herding env. Tuned for a 28-D obs / 2-D
# continuous action space with 16 parallel envs on GPU. The values start
# from the SB3 defaults and are nudged toward longer credit assignment
# (gamma=0.995) and a higher entropy bonus (ent_coef=0.05) to keep
# exploration alive while the curriculum expands the flock size.

# --- PPO ---
# Core PPO settings. Keys follow the order of the SB3 PPO arguments.

# Keep the ".0": some YAML 1.1 parsers (e.g. PyYAML) only resolve a float
# in exponent form when it has a decimal point; a bare 3e-4 loads as a string.
learning_rate: 3.0e-4
# Rollout length per env before each update. With n_envs: 16 that is
# 2048 * 16 = 32768 transitions collected per update.
n_steps: 2048
# SB3 minibatch size; 32768 / 256 = 128 minibatches per epoch.
batch_size: 256
n_epochs: 10
gamma: 0.995
gae_lambda: 0.95
clip_range: 0.2
# Raised from 0.01: earlier runs collapsed to ~0 actions.
ent_coef: 0.05
vf_coef: 0.5
max_grad_norm: 0.5
# null disables the early stop on KL divergence.
target_kl: null

# --- Network ---
policy: MlpPolicy
# Layer sizes for the policy (pi) and value (vf) networks: two layers of
# 128 units each. Presumably mapped to SB3's net_arch=dict(pi=..., vf=...)
# by the training script -- TODO confirm.
net_arch_pi: [128, 128]
net_arch_vf: [128, 128]
# Initial log-std of the action distribution: std = exp(0.5) ≈ 1.65 instead
# of the default 1.0, for more exploration.
log_std_init: 0.5

# --- Training schedule ---
# Step counts are written without digit separators: a value such as
# "10_000_000" is an integer only under YAML 1.1 rules (e.g. PyYAML); a
# YAML 1.2 core-schema parser loads it as a string. Plain digits parse as
# the same integer under both.
total_timesteps: 10000000  # 10M
n_envs: 16
checkpoint_freq: 500000    # 500k, in env steps
eval_freq: 100000          # 100k, in env steps
n_eval_episodes: 20
# NOTE(review): SB3's CheckpointCallback / EvalCallback count calls to the
# vectorized env.step(), each of which advances n_envs env steps. Confirm
# the training script converts these env-step values (e.g. divides by
# n_envs) before passing them to the callbacks.

# --- Curriculum (flock size + spawn difficulty, in env steps) ---
# Each entry takes effect once training reaches `step` env steps and sets
# two independent knobs:
#
#   max_n_sheep  Flock-size curriculum. The env samples the flock size
#                uniformly from [1, max_n_sheep] each reset, so raising it
#                widens the distribution gradually rather than swapping
#                fixed sizes.
#   difficulty   State-space curriculum controlling the sheep spawn area
#                (0 = sheep spawn just north of gate, 1 = sheep spawn
#                anywhere in field).
#
# The two together let the policy first learn "what penning looks like"
# in a regime where random exploration reliably triggers it, then
# gradually generalise to the deployment distribution.
#
# `step` values are written without digit separators: "1_000_000" is an
# integer only under YAML 1.1 rules (e.g. PyYAML); a YAML 1.2 core-schema
# parser loads it as a string. Entries are listed in ascending `step` order.
#
# NOTE(review): the final stage starts at 9M of total_timesteps = 10M, so
# the full deployment distribution is trained on for only the last 1M
# steps -- confirm this is intended.
curriculum:
  - { step: 0,       max_n_sheep: 1,  difficulty: 0.0 }
  - { step: 1000000, max_n_sheep: 1,  difficulty: 0.3 }  # 1M
  - { step: 2000000, max_n_sheep: 2,  difficulty: 0.5 }  # 2M
  - { step: 4000000, max_n_sheep: 3,  difficulty: 0.8 }  # 4M
  - { step: 6000000, max_n_sheep: 5,  difficulty: 1.0 }  # 6M
  - { step: 8000000, max_n_sheep: 8,  difficulty: 1.0 }  # 8M
  - { step: 9000000, max_n_sheep: 10, difficulty: 1.0 }  # 9M