diff --git a/training/runs/ppo_fix_check2.log b/training/runs/ppo_fix_check2.log new file mode 100644 index 0000000..a345ff5 --- /dev/null +++ b/training/runs/ppo_fix_check2.log @@ -0,0 +1,3391 @@ +Using cpu device +Logging to runs/ppo_fix_check2/ppo_1 +------------------------------ +| time/ | | +| fps | 4605 | +| iterations | 1 | +| time_elapsed | 3 | +| total_timesteps | 16384 | +------------------------------ +------------------------------------------ +| time/ | | +| fps | 4011 | +| iterations | 2 | +| time_elapsed | 8 | +| total_timesteps | 32768 | +| train/ | | +| approx_kl | 0.0033352287 | +| clip_fraction | 0.0253 | +| clip_range | 0.2 | +| entropy_loss | -2.83 | +| explained_variance | 0.271 | +| learning_rate | 0.0003 | +| loss | -0.00687 | +| n_updates | 10 | +| policy_gradient_loss | -0.00103 | +| std | 0.996 | +| value_loss | 0.0684 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 3789 | +| iterations | 3 | +| time_elapsed | 12 | +| total_timesteps | 49152 | +| train/ | | +| approx_kl | 0.005950423 | +| clip_fraction | 0.0552 | +| clip_range | 0.2 | +| entropy_loss | -2.83 | +| explained_variance | 0.527 | +| learning_rate | 0.0003 | +| loss | -0.0153 | +| n_updates | 20 | +| policy_gradient_loss | -0.0029 | +| std | 0.997 | +| value_loss | 0.0663 | +----------------------------------------- +/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. + warnings.warn( +Eval num_timesteps=50000, episode_reward=-25.68 +/- 59.67 +Episode length: 1815.95 +/- 456.88 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.82e+03 | +| mean_reward | -25.7 | +| time/ | | +| total_timesteps | 50000 | +| train/ | | +| approx_kl | 0.0040030424 | +| clip_fraction | 0.0356 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.421 | +| learning_rate | 0.0003 | +| loss | 0.149 | +| n_updates | 30 | +| policy_gradient_loss | -0.00198 | +| std | 1.01 | +| value_loss | 0.114 | +------------------------------------------ +New best mean reward! +------------------------------ +| time/ | | +| fps | 2351 | +| iterations | 4 | +| time_elapsed | 27 | +| total_timesteps | 65536 | +------------------------------ +----------------------------------------- +| time/ | | +| fps | 2446 | +| iterations | 5 | +| time_elapsed | 33 | +| total_timesteps | 81920 | +| train/ | | +| approx_kl | 0.005522004 | +| clip_fraction | 0.0604 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.737 | +| learning_rate | 0.0003 | +| loss | -0.0301 | +| n_updates | 40 | +| policy_gradient_loss | -0.00434 | +| std | 1.01 | +| value_loss | 0.0164 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 2617 | +| iterations | 6 | +| time_elapsed | 37 | +| total_timesteps | 98304 | +| train/ | | +| approx_kl | 0.0052388343 | +| clip_fraction | 0.0463 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.626 | +| learning_rate | 0.0003 | +| loss | -0.0294 | +| n_updates | 50 | +| policy_gradient_loss | -0.00297 | +| std | 1.01 | +| value_loss | 0.0597 | +------------------------------------------ +/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. + warnings.warn( +Eval num_timesteps=100000, episode_reward=-22.76 +/- 46.60 +Episode length: 1900.95 +/- 430.60 +----------------------------------------- +| eval/ | | +| mean_ep_length | 1.9e+03 | +| mean_reward | -22.8 | +| time/ | | +| total_timesteps | 100000 | +| train/ | | +| approx_kl | 0.005612197 | +| clip_fraction | 0.0475 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.747 | +| learning_rate | 0.0003 | +| loss | -0.0261 | +| n_updates | 60 | +| policy_gradient_loss | -0.00393 | +| std | 1.01 | +| value_loss | 0.0517 | +----------------------------------------- +New best mean reward! +------------------------------- +| time/ | | +| fps | 2178 | +| iterations | 7 | +| time_elapsed | 52 | +| total_timesteps | 114688 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 2294 | +| iterations | 8 | +| time_elapsed | 57 | +| total_timesteps | 131072 | +| train/ | | +| approx_kl | 0.0057119504 | +| clip_fraction | 0.0541 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.896 | +| learning_rate | 0.0003 | +| loss | -0.0144 | +| n_updates | 70 | +| policy_gradient_loss | -0.00364 | +| std | 1 | +| value_loss | 0.0738 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 2393 | +| iterations | 9 | +| time_elapsed | 61 | +| total_timesteps | 147456 | +| train/ | | +| approx_kl | 0.005940904 | +| clip_fraction | 0.0565 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.89 | +| learning_rate | 0.0003 | +| loss | -0.0283 | +| n_updates | 80 | +| policy_gradient_loss | -0.00245 | +| std | 1.01 | +| value_loss | 0.0761 | +----------------------------------------- +Eval num_timesteps=150000, episode_reward=-29.37 +/- 28.32 +Episode length: 1997.50 +/- 10.90 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -29.4 | +| time/ | | +| total_timesteps | 150000 | +| train/ | | +| approx_kl | 0.004531667 | +| clip_fraction | 0.0392 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.958 | +| learning_rate | 0.0003 | +| loss | -0.0343 | +| n_updates | 90 | +| policy_gradient_loss | -0.00379 | +| std | 1.01 | +| value_loss | 0.00995 | +----------------------------------------- + +[Diag @ 150,000 | n_sheep=1 | success=0%] + COMPACT_CANT_DRIVE 17/20 + DROVE_NO_SHEEP 3/20 + action_mag mean=0.089 p10=0.003 p90=0.274 (0=stopped, 1=full speed) + min_flock_radius mean=0.00m best=0.00m (target <5m to compact) + min_dog_to_com mean=4.40m best=2.07m (FLEE_DIST=7m) + min_com_to_pen mean=11.66m best=1.50m + reward/step (mean): progress=+0.0004 alignment=+0.0000 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 +------------------------------- +| time/ | | +| fps | 1950 | +| iterations | 10 | +| time_elapsed | 84 | +| total_timesteps | 163840 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 2020 | +| iterations | 11 | +| time_elapsed | 89 | +| total_timesteps | 180224 | +| train/ | | +| approx_kl | 0.0061831754 | +| clip_fraction | 0.068 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.975 | +| learning_rate | 0.0003 | +| loss | -0.0349 | +| n_updates | 100 | +| policy_gradient_loss | -0.00607 | +| std | 1.02 | +| value_loss | 0.0156 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 2084 | +| iterations | 12 | +| time_elapsed | 94 | +| total_timesteps | 196608 | +| train/ | | +| approx_kl | 0.009407628 | +| clip_fraction | 0.123 | +| clip_range | 0.2 | +| entropy_loss | -2.87 | +| explained_variance | 0.899 | +| learning_rate | 0.0003 | +| loss | -0.0305 | +| n_updates | 110 | +| policy_gradient_loss | -0.00932 | +| std | 1.02 | +| value_loss | 0.0223 | +----------------------------------------- +Eval num_timesteps=200000, episode_reward=-12.36 +/- 51.37 +Episode length: 1880.20 +/- 355.04 +----------------------------------------- +| eval/ | | +| mean_ep_length | 1.88e+03 | +| mean_reward | -12.4 | +| time/ | | +| total_timesteps | 200000 | +| train/ | | +| approx_kl | 0.008270489 | +| clip_fraction | 0.0945 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.945 | +| learning_rate | 0.0003 | +| loss | -0.0339 | +| n_updates | 120 | +| policy_gradient_loss | -0.00809 | +| std | 1 | +| value_loss | 0.0162 | +----------------------------------------- +New best mean reward! +------------------------------- +| time/ | | +| fps | 1936 | +| iterations | 13 | +| time_elapsed | 109 | +| total_timesteps | 212992 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1989 | +| iterations | 14 | +| time_elapsed | 115 | +| total_timesteps | 229376 | +| train/ | | +| approx_kl | 0.008541125 | +| clip_fraction | 0.112 | +| clip_range | 0.2 | +| entropy_loss | -2.83 | +| explained_variance | 0.944 | +| learning_rate | 0.0003 | +| loss | -0.0184 | +| n_updates | 130 | +| policy_gradient_loss | -0.00846 | +| std | 0.994 | +| value_loss | 0.0284 | +----------------------------------------- +---------------------------------------- +| time/ | | +| fps | 2037 | +| iterations | 15 | +| time_elapsed | 120 | +| total_timesteps | 245760 | +| train/ | | +| approx_kl | 0.00763176 | +| clip_fraction | 0.0894 | +| clip_range | 0.2 | +| entropy_loss | -2.81 | +| explained_variance | 0.9 | +| learning_rate | 0.0003 | +| loss | -0.0128 | +| n_updates | 140 | +| policy_gradient_loss | -0.00655 | +| std | 0.987 | +| value_loss | 0.071 | +---------------------------------------- +Eval num_timesteps=250000, episode_reward=45.82 +/- 68.33 +Episode length: 1391.70 +/- 757.58 +----------------------------------------- +| eval/ | | +| mean_ep_length | 1.39e+03 | +| mean_reward | 45.8 | +| time/ | | +| total_timesteps | 250000 | +| train/ | | +| approx_kl | 0.009210973 | +| clip_fraction | 0.11 | +| clip_range | 0.2 | +| entropy_loss | -2.81 | +| explained_variance | 0.95 | +| learning_rate | 0.0003 | +| loss | -0.0401 | +| n_updates | 150 | +| policy_gradient_loss | -0.0082 | +| std | 0.986 | +| value_loss | 0.0202 | +----------------------------------------- +New best mean reward! +------------------------------- +| time/ | | +| fps | 1958 | +| iterations | 16 | +| time_elapsed | 133 | +| total_timesteps | 262144 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 2005 | +| iterations | 17 | +| time_elapsed | 138 | +| total_timesteps | 278528 | +| train/ | | +| approx_kl | 0.008197077 | +| clip_fraction | 0.096 | +| clip_range | 0.2 | +| entropy_loss | -2.79 | +| explained_variance | 0.949 | +| learning_rate | 0.0003 | +| loss | -0.0375 | +| n_updates | 160 | +| policy_gradient_loss | -0.00834 | +| std | 0.976 | +| value_loss | 0.0207 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 2061 | +| iterations | 18 | +| time_elapsed | 143 | +| total_timesteps | 294912 | +| train/ | | +| approx_kl | 0.006078005 | +| clip_fraction | 0.0598 | +| clip_range | 0.2 | +| entropy_loss | -2.78 | +| explained_variance | 0.965 | +| learning_rate | 0.0003 | +| loss | -0.0188 | +| n_updates | 170 | +| policy_gradient_loss | -0.00464 | +| std | 0.969 | +| value_loss | 0.0178 | +----------------------------------------- +Eval num_timesteps=300000, episode_reward=56.19 +/- 63.26 +Episode length: 1246.75 +/- 843.82 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.25e+03 | +| mean_reward | 56.2 | +| time/ | | +| total_timesteps | 300000 | +| train/ | | +| approx_kl | 0.0056289425 | +| clip_fraction | 0.0523 | +| clip_range | 0.2 | +| entropy_loss | -2.76 | +| explained_variance | 0.969 | +| learning_rate | 0.0003 | +| loss | -0.0246 | +| n_updates | 180 | +| policy_gradient_loss | -0.00378 | +| std | 0.961 | +| value_loss | 0.0174 | +------------------------------------------ +New best mean reward! + +[Diag @ 300,000 | n_sheep=1 | success=40%] + DROVE_NO_SHEEP 11/20 + SUCCESS 8/20 + COMPACT_CANT_DRIVE 1/20 + action_mag mean=0.076 p10=0.000 p90=0.193 (0=stopped, 1=full speed) + min_flock_radius mean=0.00m best=0.00m (target <5m to compact) + min_dog_to_com mean=2.83m best=0.24m (FLEE_DIST=7m) + min_com_to_pen mean=2.99m best=1.50m + reward/step (mean): progress=+0.0236 alignment=+0.0012 pen_bonus=+0.0029 step_cost=-0.0200 complete=+0.0291 +------------------------------- +| time/ | | +| fps | 1939 | +| iterations | 19 | +| time_elapsed | 160 | +| total_timesteps | 311296 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1983 | +| iterations | 20 | +| time_elapsed | 165 | +| total_timesteps | 327680 | +| train/ | | +| approx_kl | 0.005042998 | +| clip_fraction | 0.05 | +| clip_range | 0.2 | +| entropy_loss | -2.73 | +| explained_variance | 0.941 | +| learning_rate | 0.0003 | +| loss | -0.0242 | +| n_updates | 190 | +| policy_gradient_loss | -0.00399 | +| std | 0.947 | +| value_loss | 0.00505 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 2018 | +| iterations | 21 | +| time_elapsed | 170 | +| total_timesteps | 344064 | +| train/ | | +| approx_kl | 0.0054986854 | +| clip_fraction | 0.0569 | +| clip_range | 0.2 | +| entropy_loss | -2.72 | +| explained_variance | 0.942 | +| learning_rate | 0.0003 | +| loss | -0.0248 | +| n_updates | 200 | +| policy_gradient_loss | -0.00415 | +| std | 0.941 | +| value_loss | 0.00784 | +------------------------------------------ +Eval num_timesteps=350000, episode_reward=25.08 +/- 61.55 +Episode length: 1562.00 +/- 761.23 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.56e+03 | +| mean_reward | 25.1 | +| time/ | | +| total_timesteps | 350000 | +| train/ | | +| approx_kl | 0.0046333643 | +| clip_fraction | 0.0476 | +| clip_range | 0.2 | +| entropy_loss | -2.71 | +| explained_variance | 0.934 | +| learning_rate | 0.0003 | +| loss | -0.0244 | +| n_updates | 210 | +| policy_gradient_loss | -0.00237 | +| std | 0.934 | +| value_loss | 0.00827 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 1950 | +| iterations | 22 | +| time_elapsed | 184 | +| total_timesteps | 360448 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1990 | +| iterations | 23 | +| time_elapsed | 189 | +| total_timesteps | 376832 | +| train/ | | +| approx_kl | 0.006686668 | +| clip_fraction | 0.0757 | +| clip_range | 0.2 | +| entropy_loss | -2.7 | +| explained_variance | 0.963 | +| learning_rate | 0.0003 | +| loss | -0.0423 | +| n_updates | 220 | +| policy_gradient_loss | -0.00244 | +| std | 0.936 | +| value_loss | 0.00575 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 2027 | +| iterations | 24 | +| time_elapsed | 193 | +| total_timesteps | 393216 | +| train/ | | +| approx_kl | 0.009116547 | +| clip_fraction | 0.103 | +| clip_range | 0.2 | +| entropy_loss | -2.71 | +| explained_variance | 0.97 | +| learning_rate | 0.0003 | +| loss | -0.0353 | +| n_updates | 230 | +| policy_gradient_loss | -0.0042 | +| std | 0.941 | +| value_loss | 0.006 | +----------------------------------------- +Eval num_timesteps=400000, episode_reward=56.91 +/- 71.91 +Episode length: 1225.25 +/- 861.21 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.23e+03 | +| mean_reward | 56.9 | +| time/ | | +| total_timesteps | 400000 | +| train/ | | +| approx_kl | 0.0061917743 | +| clip_fraction | 0.0658 | +| clip_range | 0.2 | +| entropy_loss | -2.72 | +| explained_variance | 0.975 | +| learning_rate | 0.0003 | +| loss | -0.0378 | +| n_updates | 240 | +| policy_gradient_loss | -0.00282 | +| std | 0.943 | +| value_loss | 0.00633 | +------------------------------------------ +New best mean reward! +------------------------------- +| time/ | | +| fps | 1981 | +| iterations | 25 | +| time_elapsed | 206 | +| total_timesteps | 409600 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 2011 | +| iterations | 26 | +| time_elapsed | 211 | +| total_timesteps | 425984 | +| train/ | | +| approx_kl | 0.007945089 | +| clip_fraction | 0.1 | +| clip_range | 0.2 | +| entropy_loss | -2.73 | +| explained_variance | 0.978 | +| learning_rate | 0.0003 | +| loss | -0.0343 | +| n_updates | 250 | +| policy_gradient_loss | -0.00475 | +| std | 0.95 | +| value_loss | 0.00708 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 2044 | +| iterations | 27 | +| time_elapsed | 216 | +| total_timesteps | 442368 | +| train/ | | +| approx_kl | 0.013059773 | +| clip_fraction | 0.152 | +| clip_range | 0.2 | +| entropy_loss | -2.76 | +| explained_variance | 0.984 | +| learning_rate | 0.0003 | +| loss | -0.0421 | +| n_updates | 260 | +| policy_gradient_loss | -0.00542 | +| std | 0.967 | +| value_loss | 0.00331 | +----------------------------------------- +Eval num_timesteps=450000, episode_reward=58.80 +/- 74.46 +Episode length: 1123.15 +/- 881.85 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.12e+03 | +| mean_reward | 58.8 | +| time/ | | +| total_timesteps | 450000 | +| train/ | | +| approx_kl | 0.0085322345 | +| clip_fraction | 0.0967 | +| clip_range | 0.2 | +| entropy_loss | -2.77 | +| explained_variance | 0.98 | +| learning_rate | 0.0003 | +| loss | -0.0264 | +| n_updates | 270 | +| policy_gradient_loss | -0.00612 | +| std | 0.963 | +| value_loss | 0.00919 | +------------------------------------------ +New best mean reward! + +[Diag @ 450,000 | n_sheep=1 | success=65%] + SUCCESS 13/20 + DROVE_NO_SHEEP 4/20 + COMPACT_CANT_DRIVE 3/20 + action_mag mean=0.105 p10=0.000 p90=0.272 (0=stopped, 1=full speed) + min_flock_radius mean=0.00m best=0.00m (target <5m to compact) + min_dog_to_com mean=1.67m best=0.43m (FLEE_DIST=7m) + min_com_to_pen mean=3.26m best=2.29m + reward/step (mean): progress=+0.0326 alignment=+0.0024 pen_bonus=+0.0076 step_cost=-0.0200 complete=+0.0762 +------------------------------- +| time/ | | +| fps | 1974 | +| iterations | 28 | +| time_elapsed | 232 | +| total_timesteps | 458752 | +------------------------------- +---------------------------------------- +| time/ | | +| fps | 2005 | +| iterations | 29 | +| time_elapsed | 236 | +| total_timesteps | 475136 | +| train/ | | +| approx_kl | 0.01203198 | +| clip_fraction | 0.146 | +| clip_range | 0.2 | +| entropy_loss | -2.79 | +| explained_variance | 0.963 | +| learning_rate | 0.0003 | +| loss | 0.00738 | +| n_updates | 280 | +| policy_gradient_loss | -0.0128 | +| std | 0.982 | +| value_loss | 0.0749 | +---------------------------------------- +------------------------------------------ +| time/ | | +| fps | 2037 | +| iterations | 30 | +| time_elapsed | 241 | +| total_timesteps | 491520 | +| train/ | | +| approx_kl | 0.0078244675 | +| clip_fraction | 0.0856 | +| clip_range | 0.2 | +| entropy_loss | -2.8 | +| explained_variance | 0.937 | +| learning_rate | 0.0003 | +| loss | 0.0631 | +| n_updates | 290 | +| policy_gradient_loss | -0.00651 | +| std | 0.977 | +| value_loss | 0.131 | +------------------------------------------ +Eval num_timesteps=500000, episode_reward=135.29 +/- 9.81 +Episode length: 287.30 +/- 88.71 +---------------------------------------- +| eval/ | | +| mean_ep_length | 287 | +| mean_reward | 135 | +| time/ | | +| total_timesteps | 500000 | +| train/ | | +| approx_kl | 0.00837522 | +| clip_fraction | 0.0866 | +| clip_range | 0.2 | +| entropy_loss | -2.77 | +| explained_variance | 0.948 | +| learning_rate | 0.0003 | +| loss | 0.041 | +| n_updates | 300 | +| policy_gradient_loss | -0.00532 | +| std | 0.962 | +| value_loss | 0.0898 | +---------------------------------------- +New best mean reward! +------------------------------- +| time/ | | +| fps | 2048 | +| iterations | 31 | +| time_elapsed | 247 | +| total_timesteps | 507904 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 2070 | +| iterations | 32 | +| time_elapsed | 253 | +| total_timesteps | 524288 | +| train/ | | +| approx_kl | 0.0067581255 | +| clip_fraction | 0.0543 | +| clip_range | 0.2 | +| entropy_loss | -2.75 | +| explained_variance | 0.932 | +| learning_rate | 0.0003 | +| loss | 0.0518 | +| n_updates | 310 | +| policy_gradient_loss | -0.00297 | +| std | 0.954 | +| value_loss | 0.111 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 2090 | +| iterations | 33 | +| time_elapsed | 258 | +| total_timesteps | 540672 | +| train/ | | +| approx_kl | 0.0066835573 | +| clip_fraction | 0.0597 | +| clip_range | 0.2 | +| entropy_loss | -2.74 | +| explained_variance | 0.934 | +| learning_rate | 0.0003 | +| loss | 0.00545 | +| n_updates | 320 | +| policy_gradient_loss | -0.00508 | +| std | 0.949 | +| value_loss | 0.101 | +------------------------------------------ +Eval num_timesteps=550000, episode_reward=136.08 +/- 11.93 +Episode length: 285.80 +/- 123.59 +------------------------------------------ +| eval/ | | +| mean_ep_length | 286 | +| mean_reward | 136 | +| time/ | | +| total_timesteps | 550000 | +| train/ | | +| approx_kl | 0.0062076193 | +| clip_fraction | 0.0672 | +| clip_range | 0.2 | +| entropy_loss | -2.71 | +| explained_variance | 0.942 | +| learning_rate | 0.0003 | +| loss | 0.0229 | +| n_updates | 330 | +| policy_gradient_loss | -0.00616 | +| std | 0.933 | +| value_loss | 0.0813 | +------------------------------------------ +New best mean reward! +------------------------------- +| time/ | | +| fps | 2104 | +| iterations | 34 | +| time_elapsed | 264 | +| total_timesteps | 557056 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 2130 | +| iterations | 35 | +| time_elapsed | 269 | +| total_timesteps | 573440 | +| train/ | | +| approx_kl | 0.0064913128 | +| clip_fraction | 0.0631 | +| clip_range | 0.2 | +| entropy_loss | -2.67 | +| explained_variance | 0.971 | +| learning_rate | 0.0003 | +| loss | -0.0199 | +| n_updates | 340 | +| policy_gradient_loss | -0.00631 | +| std | 0.917 | +| value_loss | 0.0185 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 2155 | +| iterations | 36 | +| time_elapsed | 273 | +| total_timesteps | 589824 | +| train/ | | +| approx_kl | 0.0067110434 | +| clip_fraction | 0.0719 | +| clip_range | 0.2 | +| entropy_loss | -2.63 | +| explained_variance | 0.98 | +| learning_rate | 0.0003 | +| loss | -0.0343 | +| n_updates | 350 | +| policy_gradient_loss | -0.0069 | +| std | 0.897 | +| value_loss | 0.0113 | +------------------------------------------ +Eval num_timesteps=600000, episode_reward=135.45 +/- 12.96 +Episode length: 273.05 +/- 118.26 +------------------------------------------ +| eval/ | | +| mean_ep_length | 273 | +| mean_reward | 135 | +| time/ | | +| total_timesteps | 600000 | +| train/ | | +| approx_kl | 0.0054842415 | +| clip_fraction | 0.0564 | +| clip_range | 0.2 | +| entropy_loss | -2.59 | +| explained_variance | 0.983 | +| learning_rate | 0.0003 | +| loss | -0.033 | +| n_updates | 360 | +| policy_gradient_loss | -0.0042 | +| std | 0.883 | +| value_loss | 0.00479 | +------------------------------------------ + +[Diag @ 600,000 | n_sheep=1 | success=100%] + SUCCESS 20/20 + action_mag mean=0.343 p10=0.232 p90=0.548 (0=stopped, 1=full speed) + min_flock_radius mean=0.00m best=0.00m (target <5m to compact) + min_dog_to_com mean=1.53m best=0.76m (FLEE_DIST=7m) + min_com_to_pen mean=3.49m best=2.84m + reward/step (mean): progress=+0.1066 alignment=+0.0088 pen_bonus=+0.0357 step_cost=-0.0200 complete=+0.3567 + +[Curriculum] leaving stage n_sheep=1 after 600,000 steps | training success rate (last 100 eps) = 100% +[Curriculum] → 2 sheep at step 600,000 + +------------------------------- +| time/ | | +| fps | 2156 | +| iterations | 37 | +| time_elapsed | 281 | +| total_timesteps | 606208 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 2173 | +| iterations | 38 | +| time_elapsed | 286 | +| total_timesteps | 622592 | +| train/ | | +| approx_kl | 0.011170821 | +| clip_fraction | 0.117 | +| clip_range | 0.2 | +| entropy_loss | -2.59 | +| explained_variance | 0.924 | +| learning_rate | 0.0003 | +| loss | -0.0137 | +| n_updates | 370 | +| policy_gradient_loss | 0.00714 | +| std | 0.886 | +| value_loss | 0.0417 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 2192 | +| iterations | 39 | +| time_elapsed | 291 | +| total_timesteps | 638976 | +| train/ | | +| approx_kl | 0.012632904 | +| clip_fraction | 0.156 | +| clip_range | 0.2 | +| entropy_loss | -2.6 | +| explained_variance | 0.858 | +| learning_rate | 0.0003 | +| loss | -0.00445 | +| n_updates | 380 | +| policy_gradient_loss | 0.00112 | +| std | 0.892 | +| value_loss | 0.0156 | +----------------------------------------- +Eval num_timesteps=650000, episode_reward=-38.36 +/- 29.94 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -38.4 | +| time/ | | +| total_timesteps | 650000 | +| train/ | | +| approx_kl | 0.012015635 | +| clip_fraction | 0.133 | +| clip_range | 0.2 | +| entropy_loss | -2.62 | +| explained_variance | 0.946 | +| learning_rate | 0.0003 | +| loss | -0.0168 | +| n_updates | 390 | +| policy_gradient_loss | -0.000726 | +| std | 0.904 | +| value_loss | 0.0126 | +----------------------------------------- +------------------------------- +| time/ | | +| fps | 2131 | +| iterations | 40 | +| time_elapsed | 307 | +| total_timesteps | 655360 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 2145 | +| iterations | 41 | +| time_elapsed | 313 | +| total_timesteps | 671744 | +| train/ | | +| approx_kl | 0.009391339 | +| clip_fraction | 0.121 | +| clip_range | 0.2 | +| entropy_loss | -2.63 | +| explained_variance | 0.955 | +| learning_rate | 0.0003 | +| loss | -0.0164 | +| n_updates | 400 | +| policy_gradient_loss | -0.00177 | +| std | 0.905 | +| value_loss | 0.00536 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 2156 | +| iterations | 42 | +| time_elapsed | 319 | +| total_timesteps | 688128 | +| train/ | | +| approx_kl | 0.0077482145 | +| clip_fraction | 0.0977 | +| clip_range | 0.2 | +| entropy_loss | -2.64 | +| explained_variance | 0.895 | +| learning_rate | 0.0003 | +| loss | -0.023 | +| n_updates | 410 | +| policy_gradient_loss | -0.00158 | +| std | 0.908 | +| value_loss | 0.0068 | +------------------------------------------ +Eval num_timesteps=700000, episode_reward=-16.26 +/- 48.54 +Episode length: 1934.20 +/- 286.82 +----------------------------------------- +| eval/ | | +| mean_ep_length | 1.93e+03 | +| mean_reward | -16.3 | +| time/ | | +| total_timesteps | 700000 | +| train/ | | +| approx_kl | 0.007948186 | +| clip_fraction | 0.0933 | +| clip_range | 0.2 | +| entropy_loss | -2.64 | +| explained_variance | 0.934 | +| learning_rate | 0.0003 | +| loss | -0.0205 | +| n_updates | 420 | +| policy_gradient_loss | -0.00233 | +| std | 0.904 | +| value_loss | 0.00556 | +----------------------------------------- +------------------------------- +| time/ | | +| fps | 2093 | +| iterations | 43 | +| time_elapsed | 336 | +| total_timesteps | 704512 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 2109 | +| iterations | 44 | +| time_elapsed | 341 | +| total_timesteps | 720896 | +| train/ | | +| approx_kl | 0.0077707805 | +| clip_fraction | 0.101 | +| clip_range | 0.2 | +| entropy_loss | -2.64 | +| explained_variance | 0.929 | +| learning_rate | 0.0003 | +| loss | -0.00469 | +| n_updates | 430 | +| policy_gradient_loss | -0.00226 | +| std | 0.909 | +| value_loss | 0.0031 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 2129 | +| iterations | 45 | +| time_elapsed | 346 | +| total_timesteps | 737280 | +| train/ | | +| approx_kl | 0.0063995067 | +| clip_fraction | 0.0823 | +| clip_range | 0.2 | +| entropy_loss | -2.66 | +| explained_variance | 0.951 | +| learning_rate | 0.0003 | +| loss | -0.0249 | +| n_updates | 440 | +| policy_gradient_loss | -0.00261 | +| std | 0.922 | +| value_loss | 0.00343 | +------------------------------------------ +Eval num_timesteps=750000, episode_reward=-12.10 +/- 56.78 +Episode length: 1850.50 +/- 449.09 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.85e+03 | +| mean_reward | -12.1 | +| time/ | | +| total_timesteps | 750000 | +| train/ | | +| approx_kl | 0.0069549307 | +| clip_fraction | 0.0847 | +| clip_range | 0.2 | +| entropy_loss | -2.68 | +| explained_variance | 0.862 | +| learning_rate | 0.0003 | +| loss | -0.0192 | +| n_updates | 450 | +| policy_gradient_loss | -0.00165 | +| std | 0.929 | +| value_loss | 0.0032 | +------------------------------------------ + +[Diag @ 750,000 | n_sheep=2 | success=5%] + COMPACT_CANT_DRIVE 9/20 + NEVER_COMPACT 9/20 + PARTIAL_1of2 1/20 + SUCCESS 1/20 + action_mag mean=0.261 p10=0.002 p90=0.983 (0=stopped, 1=full speed) + min_flock_radius mean=3.93m best=0.00m (target <5m to compact) + min_dog_to_com mean=0.79m best=0.07m (FLEE_DIST=7m) + min_com_to_pen mean=13.43m best=1.62m + reward/step (mean): progress=-0.0058 alignment=+0.0087 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0025 +------------------------------- +| time/ | | +| fps | 2043 | +| iterations | 46 | +| time_elapsed | 368 | +| total_timesteps | 753664 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 2062 | +| iterations | 47 | +| time_elapsed | 373 | +| total_timesteps | 770048 | +| train/ | | +| approx_kl | 0.008165602 | +| clip_fraction | 0.0997 | +| clip_range | 0.2 | +| entropy_loss | -2.69 | +| explained_variance | 0.931 | +| learning_rate | 0.0003 | +| loss | -0.0461 | +| n_updates | 460 | +| policy_gradient_loss | -0.00412 | +| std | 0.932 | +| value_loss | 0.00308 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 2074 | +| iterations | 48 | +| time_elapsed | 379 | +| total_timesteps | 786432 | +| train/ | | +| approx_kl | 0.006088208 | +| clip_fraction | 0.0805 | +| clip_range | 0.2 | +| entropy_loss | -2.71 | +| explained_variance | 0.917 | +| learning_rate | 0.0003 | +| loss | -0.034 | +| n_updates | 470 | +| policy_gradient_loss | -0.000257 | +| std | 0.943 | +| value_loss | 0.00533 | +----------------------------------------- +Eval num_timesteps=800000, episode_reward=-32.78 +/- 23.33 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -32.8 | +| time/ | | +| total_timesteps | 800000 | +| train/ | | +| approx_kl | 0.0069386996 | +| clip_fraction | 0.0883 | +| clip_range | 0.2 | +| entropy_loss | -2.73 | +| explained_variance | 0.954 | +| learning_rate | 0.0003 | +| loss | -0.0361 | +| n_updates | 480 | +| policy_gradient_loss | -0.00228 | +| std | 0.948 | +| value_loss | 0.00495 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 2028 | +| iterations | 49 | +| time_elapsed | 395 | +| total_timesteps | 802816 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 2045 | +| iterations | 50 | +| time_elapsed | 400 | +| total_timesteps | 819200 | +| train/ | | +| approx_kl | 0.0070893797 | +| clip_fraction | 0.0687 | +| clip_range | 0.2 | +| entropy_loss | -2.74 | +| explained_variance | 0.955 | +| learning_rate | 0.0003 | +| loss | -0.035 | +| n_updates | 490 | +| policy_gradient_loss | -0.00221 | +| std | 0.954 | +| value_loss | 0.00229 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 2060 | +| iterations | 51 | +| time_elapsed | 405 | +| total_timesteps | 835584 | +| train/ | | +| approx_kl | 0.0068652867 | +| clip_fraction | 0.0787 | +| clip_range | 0.2 | +| entropy_loss | -2.75 | +| explained_variance | 0.863 | +| learning_rate | 0.0003 | +| loss | -0.0337 | +| n_updates | 500 | +| policy_gradient_loss | -0.00277 | +| std | 0.959 | +| value_loss | 0.00229 | +------------------------------------------ +Eval num_timesteps=850000, episode_reward=-14.34 +/- 48.77 +Episode length: 1998.40 +/- 6.97 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -14.3 | +| time/ | | +| total_timesteps | 850000 | +| train/ | | +| approx_kl | 0.007872021 | +| clip_fraction | 0.0815 | +| clip_range | 0.2 | +| entropy_loss | -2.76 | +| explained_variance | 0.852 | +| learning_rate | 0.0003 | +| loss | -0.0358 | +| n_updates | 510 | +| policy_gradient_loss | -0.00365 | +| std | 0.966 | +| value_loss | 0.00272 | +----------------------------------------- +------------------------------- +| time/ | | +| fps | 2018 | +| iterations | 52 | +| time_elapsed | 422 | +| total_timesteps | 851968 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 2032 | +| iterations | 53 | +| time_elapsed | 427 | +| total_timesteps | 868352 | +| train/ | | +| approx_kl | 0.007002457 | +| clip_fraction | 0.0752 | +| clip_range | 0.2 | +| entropy_loss | -2.78 | +| explained_variance | 0.879 | +| learning_rate | 0.0003 | +| loss | -0.0414 | +| n_updates | 520 | +| policy_gradient_loss | -0.00242 | +| std | 0.977 | +| value_loss | 0.00166 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 2047 | +| iterations | 54 | +| time_elapsed | 432 | +| total_timesteps | 884736 | +| train/ | | +| approx_kl | 0.007822147 | +| clip_fraction | 0.0813 | +| clip_range | 0.2 | +| entropy_loss | -2.8 | +| explained_variance | 0.871 | +| learning_rate | 0.0003 | +| loss | -0.0376 | +| n_updates | 530 | +| policy_gradient_loss | -0.00362 | +| std | 0.984 | +| value_loss | 0.00212 | +----------------------------------------- +Eval num_timesteps=900000, episode_reward=-20.41 +/- 60.01 +Episode length: 1929.40 +/- 284.99 +---------------------------------------- +| eval/ | | +| mean_ep_length | 1.93e+03 | +| mean_reward | -20.4 | +| time/ | | +| total_timesteps | 900000 | +| train/ | | +| approx_kl | 0.00738756 | +| clip_fraction | 0.0793 | +| clip_range | 0.2 | +| entropy_loss | -2.81 | +| explained_variance | 0.808 | +| learning_rate | 0.0003 | +| loss | -0.0355 | +| n_updates | 540 | +| policy_gradient_loss | -0.00195 | +| std | 0.988 | +| value_loss | 0.00721 | +---------------------------------------- + +[Diag @ 900,000 | n_sheep=2 | success=5%] + COMPACT_CANT_DRIVE 11/20 + NEVER_COMPACT 8/20 + SUCCESS 1/20 + action_mag mean=0.203 p10=0.007 p90=0.704 (0=stopped, 1=full speed) + min_flock_radius mean=3.40m best=0.00m (target <5m to compact) + min_dog_to_com mean=0.60m best=0.11m (FLEE_DIST=7m) + min_com_to_pen mean=14.01m best=3.61m + reward/step (mean): progress=-0.0040 alignment=+0.0071 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0026 +------------------------------- +| time/ | | +| fps | 1977 | +| iterations | 55 | +| time_elapsed | 455 | +| total_timesteps | 901120 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1990 | +| iterations | 56 | +| time_elapsed | 460 | +| total_timesteps | 917504 | +| train/ | | +| approx_kl | 0.007000256 | +| clip_fraction | 0.0831 | +| clip_range | 0.2 | +| entropy_loss | -2.8 | +| explained_variance | 0.889 | +| learning_rate | 0.0003 | +| loss | -0.0285 | +| n_updates | 550 | +| policy_gradient_loss | -0.00402 | +| std | 0.984 | +| value_loss | 0.00171 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 2005 | +| iterations | 57 | +| time_elapsed | 465 | +| total_timesteps | 933888 | +| train/ | | +| approx_kl | 0.007749311 | +| clip_fraction | 0.0755 | +| clip_range | 0.2 | +| entropy_loss | -2.83 | +| explained_variance | 0.599 | +| learning_rate | 0.0003 | +| loss | -0.032 | +| n_updates | 560 | +| policy_gradient_loss | -0.00239 | +| std | 1.01 | +| value_loss | 0.00351 | +----------------------------------------- +Eval num_timesteps=950000, episode_reward=-13.16 +/- 44.70 +Episode length: 1949.30 +/- 221.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.95e+03 | +| mean_reward | -13.2 | +| time/ | | +| total_timesteps | 950000 | +| train/ | | +| approx_kl | 0.0075328955 | +| clip_fraction | 0.0829 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.783 | +| learning_rate | 0.0003 | +| loss | -0.0306 | +| n_updates | 570 | +| policy_gradient_loss | -0.00352 | +| std | 1.01 | +| value_loss | 0.00319 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 1971 | +| iterations | 58 | +| time_elapsed | 482 | +| total_timesteps | 950272 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 1981 | +| iterations | 59 | +| time_elapsed | 487 | +| total_timesteps | 966656 | +| train/ | | +| approx_kl | 0.0072506005 | +| clip_fraction | 0.0835 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.929 | +| learning_rate | 0.0003 | +| loss | -0.0291 | +| n_updates | 580 | +| policy_gradient_loss | -0.00173 | +| std | 1.01 | +| value_loss | 0.00491 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1991 | +| iterations | 60 | +| time_elapsed | 493 | +| total_timesteps | 983040 | +| train/ | | +| approx_kl | 0.0068104668 | +| clip_fraction | 0.0799 | +| clip_range | 0.2 | +| entropy_loss | -2.87 | +| explained_variance | 0.813 | +| learning_rate | 0.0003 | +| loss | -0.0282 | +| n_updates | 590 | +| policy_gradient_loss | -0.00162 | +| std | 1.02 | +| value_loss | 0.00477 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 2005 | +| iterations | 61 | +| time_elapsed | 498 | +| total_timesteps | 999424 | +| train/ | | +| approx_kl | 0.007103944 | +| clip_fraction | 0.0774 | +| clip_range | 0.2 | +| entropy_loss | -2.88 | +| explained_variance | 0.942 | +| learning_rate | 0.0003 | +| loss | -0.0322 | +| n_updates | 600 | +| policy_gradient_loss | -0.00143 | +| std | 1.03 | +| value_loss | 0.0033 | +----------------------------------------- +Eval num_timesteps=1000000, episode_reward=-25.58 +/- 49.00 +Episode length: 1999.50 +/- 2.18 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -25.6 | +| time/ | | +| total_timesteps | 1000000 | +| train/ | | +| approx_kl | 0.0075788023 | +| clip_fraction | 0.088 | +| clip_range | 0.2 | +| entropy_loss | -2.9 | +| explained_variance | 0.864 | +| learning_rate | 0.0003 | +| loss | -0.0352 | +| n_updates | 610 | +| policy_gradient_loss | -0.003 | +| std | 1.04 | +| value_loss | 0.00192 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1971 | +| iterations | 62 | +| time_elapsed | 515 | +| total_timesteps | 1015808 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1983 | +| iterations | 63 | +| time_elapsed | 520 | +| total_timesteps | 1032192 | +| train/ | | +| approx_kl | 0.009131588 | +| clip_fraction | 0.0902 | +| clip_range | 0.2 | +| entropy_loss | -2.89 | +| explained_variance | 0.941 | +| learning_rate | 0.0003 | +| loss | -0.0476 | +| n_updates | 620 | +| policy_gradient_loss | -0.00341 | +| std | 1.03 | +| value_loss | 0.00705 | +----------------------------------------- +---------------------------------------- +| time/ | | +| fps | 1995 | +| iterations | 64 | +| time_elapsed | 525 | +| total_timesteps | 1048576 | +| train/ | | +| approx_kl | 0.00746674 | +| clip_fraction | 0.0838 | +| clip_range | 0.2 | +| entropy_loss | -2.89 | +| explained_variance | 0.958 | +| learning_rate | 0.0003 | +| loss | -0.022 | +| n_updates | 630 | +| policy_gradient_loss | -0.00392 | +| std | 1.03 | +| value_loss | 0.00592 | +---------------------------------------- +Eval num_timesteps=1050000, episode_reward=-12.04 +/- 64.56 +Episode length: 1889.90 +/- 333.38 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.89e+03 | +| mean_reward | -12 | +| time/ | | +| total_timesteps | 1050000 | +| train/ | | +| approx_kl | 0.0058071706 | +| clip_fraction | 0.0721 | +| clip_range | 0.2 | +| entropy_loss | -2.9 | +| explained_variance | 0.932 | +| learning_rate | 0.0003 | +| loss | -0.0188 | +| n_updates | 640 | +| policy_gradient_loss | -0.00235 | +| std | 1.03 | +| value_loss | 0.00513 | +------------------------------------------ + +[Diag @ 1,050,000 | n_sheep=2 | success=5%] + COMPACT_CANT_DRIVE 10/20 + NEVER_COMPACT 9/20 + SUCCESS 1/20 + action_mag mean=0.190 p10=0.001 p90=0.686 (0=stopped, 1=full speed) + min_flock_radius mean=4.60m best=0.00m (target <5m to compact) + min_dog_to_com mean=0.54m best=0.21m (FLEE_DIST=7m) + min_com_to_pen mean=13.05m best=3.62m + reward/step (mean): progress=-0.0023 alignment=+0.0072 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0025 +-------------------------------- +| time/ | | +| fps | 1931 | +| iterations | 65 | +| time_elapsed | 551 | +| total_timesteps | 1064960 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1944 | +| iterations | 66 | +| time_elapsed | 556 | +| total_timesteps | 1081344 | +| train/ | | +| approx_kl | 0.006802067 | +| clip_fraction | 0.0701 | +| clip_range | 0.2 | +| entropy_loss | -2.92 | +| explained_variance | 0.937 | +| learning_rate | 0.0003 | +| loss | -0.0304 | +| n_updates | 650 | +| policy_gradient_loss | -0.0019 | +| std | 1.04 | +| value_loss | 0.00206 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1956 | +| iterations | 67 | +| time_elapsed | 561 | +| total_timesteps | 1097728 | +| train/ | | +| approx_kl | 0.007102525 | +| clip_fraction | 0.074 | +| clip_range | 0.2 | +| entropy_loss | -2.92 | +| explained_variance | 0.953 | +| learning_rate | 0.0003 | +| loss | -0.00869 | +| n_updates | 660 | +| policy_gradient_loss | -0.00208 | +| std | 1.04 | +| value_loss | 0.00579 | +----------------------------------------- +Eval num_timesteps=1100000, episode_reward=-29.51 +/- 23.80 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -29.5 | +| time/ | | +| total_timesteps | 1100000 | +| train/ | | +| approx_kl | 0.006372301 | +| clip_fraction | 0.0669 | +| clip_range | 0.2 | +| entropy_loss | -2.94 | +| explained_variance | 0.829 | +| learning_rate | 0.0003 | +| loss | -0.0349 | +| n_updates | 670 | +| policy_gradient_loss | -0.00135 | +| std | 1.06 | +| value_loss | 0.00208 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1932 | +| iterations | 68 | +| time_elapsed | 576 | +| total_timesteps | 1114112 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1942 | +| iterations | 69 | +| time_elapsed | 581 | +| total_timesteps | 1130496 | +| train/ | | +| approx_kl | 0.007083354 | +| clip_fraction | 0.0839 | +| clip_range | 0.2 | +| entropy_loss | -2.95 | +| explained_variance | 0.845 | +| learning_rate | 0.0003 | +| loss | -0.0464 | +| n_updates | 680 | +| policy_gradient_loss | -0.00298 | +| std | 1.06 | +| value_loss | 0.00747 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1954 | +| iterations | 70 | +| time_elapsed | 586 | +| total_timesteps | 1146880 | +| train/ | | +| approx_kl | 0.007034454 | +| clip_fraction | 0.0875 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.892 | +| learning_rate | 0.0003 | +| loss | -0.0382 | +| n_updates | 690 | +| policy_gradient_loss | -0.00359 | +| std | 1.06 | +| value_loss | 0.00208 | +----------------------------------------- +Eval num_timesteps=1150000, episode_reward=-20.98 +/- 49.18 +Episode length: 1959.70 +/- 175.66 +----------------------------------------- +| eval/ | | +| mean_ep_length | 1.96e+03 | +| mean_reward | -21 | +| time/ | | +| total_timesteps | 1150000 | +| train/ | | +| approx_kl | 0.006192833 | +| clip_fraction | 0.0626 | +| clip_range | 0.2 | +| entropy_loss | -2.94 | +| explained_variance | 0.951 | +| learning_rate | 0.0003 | +| loss | -0.0224 | +| n_updates | 700 | +| policy_gradient_loss | -0.00299 | +| std | 1.05 | +| value_loss | 0.00883 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1926 | +| iterations | 71 | +| time_elapsed | 603 | +| total_timesteps | 1163264 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1937 | +| iterations | 72 | +| time_elapsed | 608 | +| total_timesteps | 1179648 | +| train/ | | +| approx_kl | 0.008185772 | +| clip_fraction | 0.0969 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.944 | +| learning_rate | 0.0003 | +| loss | -0.0278 | +| n_updates | 710 | +| policy_gradient_loss | -0.00316 | +| std | 1.07 | +| value_loss | 0.00421 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1947 | +| iterations | 73 | +| time_elapsed | 614 | +| total_timesteps | 1196032 | +| train/ | | +| approx_kl | 0.0063469247 | +| clip_fraction | 0.065 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.912 | +| learning_rate | 0.0003 | +| loss | -0.0239 | +| n_updates | 720 | +| policy_gradient_loss | -0.00224 | +| std | 1.06 | +| value_loss | 0.0054 | +------------------------------------------ +Eval num_timesteps=1200000, episode_reward=-29.34 +/- 18.71 +Episode length: 2000.00 +/- 0.00 +---------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -29.3 | +| time/ | | +| total_timesteps | 1200000 | +| train/ | | +| approx_kl | 0.00778389 | +| clip_fraction | 0.0734 | +| clip_range | 0.2 | +| entropy_loss | -2.95 | +| explained_variance | 0.961 | +| learning_rate | 0.0003 | +| loss | -0.0435 | +| n_updates | 730 | +| policy_gradient_loss | -0.00184 | +| std | 1.06 | +| value_loss | 0.0048 | +---------------------------------------- + +[Diag @ 1,200,000 | n_sheep=2 | success=10%] + NEVER_COMPACT 9/20 + COMPACT_CANT_DRIVE 9/20 + SUCCESS 2/20 + action_mag mean=0.198 p10=0.002 p90=0.744 (0=stopped, 1=full speed) + min_flock_radius mean=3.94m best=0.00m (target <5m to compact) + min_dog_to_com mean=0.50m best=0.14m (FLEE_DIST=7m) + min_com_to_pen mean=11.36m best=3.58m + reward/step (mean): progress=-0.0002 alignment=+0.0073 pen_bonus=+0.0013 step_cost=-0.0200 complete=+0.0053 + +[Curriculum] leaving stage n_sheep=2 after 600,000 steps | training success rate (last 100 eps) = 5% +[Curriculum] → 3 sheep at step 1,200,000 + +-------------------------------- +| time/ | | +| fps | 1898 | +| iterations | 74 | +| time_elapsed | 638 | +| total_timesteps | 1212416 | +-------------------------------- +---------------------------------------- +| time/ | | +| fps | 1909 | +| iterations | 75 | +| time_elapsed | 643 | +| total_timesteps | 1228800 | +| train/ | | +| approx_kl | 0.00918101 | +| clip_fraction | 0.106 | +| clip_range | 0.2 | +| entropy_loss | -2.95 | +| explained_variance | 0.919 | +| learning_rate | 0.0003 | +| loss | -0.0112 | +| n_updates | 740 | +| policy_gradient_loss | -0.00123 | +| std | 1.06 | +| value_loss | 0.0427 | +---------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1917 | +| iterations | 76 | +| time_elapsed | 649 | +| total_timesteps | 1245184 | +| train/ | | +| approx_kl | 0.010076641 | +| clip_fraction | 0.137 | +| clip_range | 0.2 | +| entropy_loss | -2.94 | +| explained_variance | 0.919 | +| learning_rate | 0.0003 | +| loss | -0.0229 | +| n_updates | 750 | +| policy_gradient_loss | -0.000617 | +| std | 1.05 | +| value_loss | 0.0222 | +----------------------------------------- +Eval num_timesteps=1250000, episode_reward=-38.73 +/- 33.85 +Episode length: 2000.00 +/- 0.00 +--------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -38.7 | +| time/ | | +| total_timesteps | 1250000 | +| train/ | | +| approx_kl | 0.0084493 | +| clip_fraction | 0.109 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.96 | +| learning_rate | 0.0003 | +| loss | -0.0259 | +| n_updates | 760 | +| policy_gradient_loss | -0.00168 | +| std | 1.06 | +| value_loss | 0.0024 | +--------------------------------------- +-------------------------------- +| time/ | | +| fps | 1890 | +| iterations | 77 | +| time_elapsed | 667 | +| total_timesteps | 1261568 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1899 | +| iterations | 78 | +| time_elapsed | 672 | +| total_timesteps | 1277952 | +| train/ | | +| approx_kl | 0.008724872 | +| clip_fraction | 0.109 | +| clip_range | 0.2 | +| entropy_loss | -2.98 | +| explained_variance | 0.931 | +| learning_rate | 0.0003 | +| loss | -0.0293 | +| n_updates | 770 | +| policy_gradient_loss | -0.00204 | +| std | 1.08 | +| value_loss | 0.0067 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1906 | +| iterations | 79 | +| time_elapsed | 678 | +| total_timesteps | 1294336 | +| train/ | | +| approx_kl | 0.008191848 | +| clip_fraction | 0.096 | +| clip_range | 0.2 | +| entropy_loss | -2.99 | +| explained_variance | 0.963 | +| learning_rate | 0.0003 | +| loss | -0.0247 | +| n_updates | 780 | +| policy_gradient_loss | -0.002 | +| std | 1.08 | +| value_loss | 0.00632 | +----------------------------------------- +Eval num_timesteps=1300000, episode_reward=-26.68 +/- 27.12 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -26.7 | +| time/ | | +| total_timesteps | 1300000 | +| train/ | | +| approx_kl | 0.006018152 | +| clip_fraction | 0.0869 | +| clip_range | 0.2 | +| entropy_loss | -3 | +| explained_variance | 0.96 | +| learning_rate | 0.0003 | +| loss | -0.0311 | +| n_updates | 790 | +| policy_gradient_loss | -0.00129 | +| std | 1.09 | +| value_loss | 0.00189 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1881 | +| iterations | 80 | +| time_elapsed | 696 | +| total_timesteps | 1310720 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1892 | +| iterations | 81 | +| time_elapsed | 701 | +| total_timesteps | 1327104 | +| train/ | | +| approx_kl | 0.0077671953 | +| clip_fraction | 0.082 | +| clip_range | 0.2 | +| entropy_loss | -3.01 | +| explained_variance | 0.972 | +| learning_rate | 0.0003 | +| loss | -0.0308 | +| n_updates | 800 | +| policy_gradient_loss | -0.00219 | +| std | 1.09 | +| value_loss | 0.00177 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 1902 | +| iterations | 82 | +| time_elapsed | 706 | +| total_timesteps | 1343488 | +| train/ | | +| approx_kl | 0.008806022 | +| clip_fraction | 0.0947 | +| clip_range | 0.2 | +| entropy_loss | -3.02 | +| explained_variance | 0.962 | +| learning_rate | 0.0003 | +| loss | -0.0426 | +| n_updates | 810 | +| policy_gradient_loss | -0.00231 | +| std | 1.1 | +| value_loss | 0.00235 | +----------------------------------------- +Eval num_timesteps=1350000, episode_reward=-24.30 +/- 32.03 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -24.3 | +| time/ | | +| total_timesteps | 1350000 | +| train/ | | +| approx_kl | 0.007263833 | +| clip_fraction | 0.0797 | +| clip_range | 0.2 | +| entropy_loss | -3.03 | +| explained_variance | 0.957 | +| learning_rate | 0.0003 | +| loss | -0.0338 | +| n_updates | 820 | +| policy_gradient_loss | -0.00251 | +| std | 1.11 | +| value_loss | 0.00397 | +----------------------------------------- + +[Diag @ 1,350,000 | n_sheep=3 | success=0%] + NEVER_COMPACT 16/20 + COMPACT_CANT_DRIVE 4/20 + action_mag mean=0.058 p10=0.004 p90=0.054 (0=stopped, 1=full speed) + min_flock_radius mean=6.77m best=1.04m (target <5m to compact) + min_dog_to_com mean=0.58m best=0.28m (FLEE_DIST=7m) + min_com_to_pen mean=12.71m best=4.27m + reward/step (mean): progress=-0.0038 alignment=+0.0015 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1859 | +| iterations | 83 | +| time_elapsed | 731 | +| total_timesteps | 1359872 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1870 | +| iterations | 84 | +| time_elapsed | 735 | +| total_timesteps | 1376256 | +| train/ | | +| approx_kl | 0.007816839 | +| clip_fraction | 0.0812 | +| clip_range | 0.2 | +| entropy_loss | -3.05 | +| explained_variance | 0.946 | +| learning_rate | 0.0003 | +| loss | -0.0285 | +| n_updates | 830 | +| policy_gradient_loss | -0.00277 | +| std | 1.11 | +| value_loss | 0.0018 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1880 | +| iterations | 85 | +| time_elapsed | 740 | +| total_timesteps | 1392640 | +| train/ | | +| approx_kl | 0.0064534983 | +| clip_fraction | 0.0774 | +| clip_range | 0.2 | +| entropy_loss | -3.06 | +| explained_variance | 0.958 | +| learning_rate | 0.0003 | +| loss | -0.0305 | +| n_updates | 840 | +| policy_gradient_loss | -0.00158 | +| std | 1.12 | +| value_loss | 0.00988 | +------------------------------------------ +Eval num_timesteps=1400000, episode_reward=-39.10 +/- 41.08 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -39.1 | +| time/ | | +| total_timesteps | 1400000 | +| train/ | | +| approx_kl | 0.0069560152 | +| clip_fraction | 0.0835 | +| clip_range | 0.2 | +| entropy_loss | -3.07 | +| explained_variance | 0.96 | +| learning_rate | 0.0003 | +| loss | -0.0302 | +| n_updates | 850 | +| policy_gradient_loss | -0.00283 | +| std | 1.12 | +| value_loss | 0.00307 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1857 | +| iterations | 86 | +| time_elapsed | 758 | +| total_timesteps | 1409024 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1864 | +| iterations | 87 | +| time_elapsed | 764 | +| total_timesteps | 1425408 | +| train/ | | +| approx_kl | 0.007682803 | +| clip_fraction | 0.0931 | +| clip_range | 0.2 | +| entropy_loss | -3.09 | +| explained_variance | 0.902 | +| learning_rate | 0.0003 | +| loss | -0.0322 | +| n_updates | 860 | +| policy_gradient_loss | -0.00224 | +| std | 1.14 | +| value_loss | 0.013 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1869 | +| iterations | 88 | +| time_elapsed | 771 | +| total_timesteps | 1441792 | +| train/ | | +| approx_kl | 0.0063949013 | +| clip_fraction | 0.0786 | +| clip_range | 0.2 | +| entropy_loss | -3.1 | +| explained_variance | 0.953 | +| learning_rate | 0.0003 | +| loss | -0.0401 | +| n_updates | 870 | +| policy_gradient_loss | -0.00134 | +| std | 1.14 | +| value_loss | 0.00193 | +------------------------------------------ +Eval num_timesteps=1450000, episode_reward=-28.59 +/- 25.61 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -28.6 | +| time/ | | +| total_timesteps | 1450000 | +| train/ | | +| approx_kl | 0.007503539 | +| clip_fraction | 0.0774 | +| clip_range | 0.2 | +| entropy_loss | -3.13 | +| explained_variance | 0.951 | +| learning_rate | 0.0003 | +| loss | -0.0378 | +| n_updates | 880 | +| policy_gradient_loss | -0.00309 | +| std | 1.16 | +| value_loss | 0.00551 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1845 | +| iterations | 89 | +| time_elapsed | 789 | +| total_timesteps | 1458176 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1852 | +| iterations | 90 | +| time_elapsed | 796 | +| total_timesteps | 1474560 | +| train/ | | +| approx_kl | 0.0075057503 | +| clip_fraction | 0.0793 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.955 | +| learning_rate | 0.0003 | +| loss | -0.0439 | +| n_updates | 890 | +| policy_gradient_loss | -0.00264 | +| std | 1.17 | +| value_loss | 0.00265 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1857 | +| iterations | 91 | +| time_elapsed | 802 | +| total_timesteps | 1490944 | +| train/ | | +| approx_kl | 0.0068523246 | +| clip_fraction | 0.0755 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.935 | +| learning_rate | 0.0003 | +| loss | -0.0282 | +| n_updates | 900 | +| policy_gradient_loss | -0.00292 | +| std | 1.17 | +| value_loss | 0.00268 | +------------------------------------------ +Eval num_timesteps=1500000, episode_reward=-40.66 +/- 25.29 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -40.7 | +| time/ | | +| total_timesteps | 1500000 | +| train/ | | +| approx_kl | 0.007249858 | +| clip_fraction | 0.0857 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.952 | +| learning_rate | 0.0003 | +| loss | -0.0366 | +| n_updates | 910 | +| policy_gradient_loss | -0.00319 | +| std | 1.17 | +| value_loss | 0.00564 | +----------------------------------------- + +[Diag @ 1,500,000 | n_sheep=3 | success=0%] + NEVER_COMPACT 14/20 + COMPACT_CANT_DRIVE 6/20 + action_mag mean=0.050 p10=0.005 p90=0.049 (0=stopped, 1=full speed) + min_flock_radius mean=6.53m best=0.98m (target <5m to compact) + min_dog_to_com mean=0.46m best=0.06m (FLEE_DIST=7m) + min_com_to_pen mean=12.38m best=5.44m + reward/step (mean): progress=+0.0039 alignment=+0.0011 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1819 | +| iterations | 92 | +| time_elapsed | 828 | +| total_timesteps | 1507328 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1828 | +| iterations | 93 | +| time_elapsed | 833 | +| total_timesteps | 1523712 | +| train/ | | +| approx_kl | 0.007471386 | +| clip_fraction | 0.0834 | +| clip_range | 0.2 | +| entropy_loss | -3.16 | +| explained_variance | 0.929 | +| learning_rate | 0.0003 | +| loss | -0.0275 | +| n_updates | 920 | +| policy_gradient_loss | -0.00192 | +| std | 1.17 | +| value_loss | 0.00791 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1835 | +| iterations | 94 | +| time_elapsed | 838 | +| total_timesteps | 1540096 | +| train/ | | +| approx_kl | 0.007296456 | +| clip_fraction | 0.0765 | +| clip_range | 0.2 | +| entropy_loss | -3.17 | +| explained_variance | 0.95 | +| learning_rate | 0.0003 | +| loss | -0.0484 | +| n_updates | 930 | +| policy_gradient_loss | -0.00366 | +| std | 1.18 | +| value_loss | 0.00788 | +----------------------------------------- +Eval num_timesteps=1550000, episode_reward=-34.66 +/- 25.47 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -34.7 | +| time/ | | +| total_timesteps | 1550000 | +| train/ | | +| approx_kl | 0.007654687 | +| clip_fraction | 0.095 | +| clip_range | 0.2 | +| entropy_loss | -3.18 | +| explained_variance | 0.92 | +| learning_rate | 0.0003 | +| loss | -0.0386 | +| n_updates | 940 | +| policy_gradient_loss | -0.00316 | +| std | 1.19 | +| value_loss | 0.00363 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1817 | +| iterations | 95 | +| time_elapsed | 856 | +| total_timesteps | 1556480 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1823 | +| iterations | 96 | +| time_elapsed | 862 | +| total_timesteps | 1572864 | +| train/ | | +| approx_kl | 0.007030643 | +| clip_fraction | 0.0881 | +| clip_range | 0.2 | +| entropy_loss | -3.18 | +| explained_variance | 0.944 | +| learning_rate | 0.0003 | +| loss | -0.0346 | +| n_updates | 950 | +| policy_gradient_loss | -0.00321 | +| std | 1.19 | +| value_loss | 0.00208 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1828 | +| iterations | 97 | +| time_elapsed | 869 | +| total_timesteps | 1589248 | +| train/ | | +| approx_kl | 0.0071562277 | +| clip_fraction | 0.0834 | +| clip_range | 0.2 | +| entropy_loss | -3.19 | +| explained_variance | 0.955 | +| learning_rate | 0.0003 | +| loss | -0.0196 | +| n_updates | 960 | +| policy_gradient_loss | -0.00259 | +| std | 1.2 | +| value_loss | 0.00773 | +------------------------------------------ +Eval num_timesteps=1600000, episode_reward=-33.49 +/- 36.88 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -33.5 | +| time/ | | +| total_timesteps | 1600000 | +| train/ | | +| approx_kl | 0.0069667175 | +| clip_fraction | 0.0741 | +| clip_range | 0.2 | +| entropy_loss | -3.2 | +| explained_variance | 0.94 | +| learning_rate | 0.0003 | +| loss | -0.0313 | +| n_updates | 970 | +| policy_gradient_loss | -0.00399 | +| std | 1.2 | +| value_loss | 0.00419 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1810 | +| iterations | 98 | +| time_elapsed | 886 | +| total_timesteps | 1605632 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1819 | +| iterations | 99 | +| time_elapsed | 891 | +| total_timesteps | 1622016 | +| train/ | | +| approx_kl | 0.0061995042 | +| clip_fraction | 0.0767 | +| clip_range | 0.2 | +| entropy_loss | -3.21 | +| explained_variance | 0.968 | +| learning_rate | 0.0003 | +| loss | -0.036 | +| n_updates | 980 | +| policy_gradient_loss | -0.00289 | +| std | 1.2 | +| value_loss | 0.00241 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 1826 | +| iterations | 100 | +| time_elapsed | 896 | +| total_timesteps | 1638400 | +| train/ | | +| approx_kl | 0.006502889 | +| clip_fraction | 0.0714 | +| clip_range | 0.2 | +| entropy_loss | -3.22 | +| explained_variance | 0.976 | +| learning_rate | 0.0003 | +| loss | -0.0445 | +| n_updates | 990 | +| policy_gradient_loss | -0.00314 | +| std | 1.21 | +| value_loss | 0.00218 | +----------------------------------------- +Eval num_timesteps=1650000, episode_reward=-38.00 +/- 30.02 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -38 | +| time/ | | +| total_timesteps | 1650000 | +| train/ | | +| approx_kl | 0.006163503 | +| clip_fraction | 0.0739 | +| clip_range | 0.2 | +| entropy_loss | -3.22 | +| explained_variance | 0.955 | +| learning_rate | 0.0003 | +| loss | -0.0391 | +| n_updates | 1000 | +| policy_gradient_loss | -0.00257 | +| std | 1.22 | +| value_loss | 0.0027 | +----------------------------------------- + +[Diag @ 1,650,000 | n_sheep=3 | success=0%] + NEVER_COMPACT 16/20 + COMPACT_CANT_DRIVE 4/20 + action_mag mean=0.054 p10=0.002 p90=0.051 (0=stopped, 1=full speed) + min_flock_radius mean=6.63m best=3.72m (target <5m to compact) + min_dog_to_com mean=0.60m best=0.09m (FLEE_DIST=7m) + min_com_to_pen mean=13.17m best=5.44m + reward/step (mean): progress=+0.0032 alignment=+0.0015 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1793 | +| iterations | 101 | +| time_elapsed | 922 | +| total_timesteps | 1654784 | +-------------------------------- +---------------------------------------- +| time/ | | +| fps | 1800 | +| iterations | 102 | +| time_elapsed | 927 | +| total_timesteps | 1671168 | +| train/ | | +| approx_kl | 0.00634938 | +| clip_fraction | 0.073 | +| clip_range | 0.2 | +| entropy_loss | -3.23 | +| explained_variance | 0.97 | +| learning_rate | 0.0003 | +| loss | -0.0462 | +| n_updates | 1010 | +| policy_gradient_loss | -0.00394 | +| std | 1.22 | +| value_loss | 0.00334 | +---------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1807 | +| iterations | 103 | +| time_elapsed | 933 | +| total_timesteps | 1687552 | +| train/ | | +| approx_kl | 0.0072235917 | +| clip_fraction | 0.0774 | +| clip_range | 0.2 | +| entropy_loss | -3.23 | +| explained_variance | 0.957 | +| learning_rate | 0.0003 | +| loss | -0.0284 | +| n_updates | 1020 | +| policy_gradient_loss | -0.00292 | +| std | 1.22 | +| value_loss | 0.00807 | +------------------------------------------ +Eval num_timesteps=1700000, episode_reward=-32.26 +/- 31.96 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -32.3 | +| time/ | | +| total_timesteps | 1700000 | +| train/ | | +| approx_kl | 0.0060304543 | +| clip_fraction | 0.0721 | +| clip_range | 0.2 | +| entropy_loss | -3.23 | +| explained_variance | 0.929 | +| learning_rate | 0.0003 | +| loss | -0.0427 | +| n_updates | 1030 | +| policy_gradient_loss | -0.00306 | +| std | 1.21 | +| value_loss | 0.00208 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1789 | +| iterations | 104 | +| time_elapsed | 952 | +| total_timesteps | 1703936 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1795 | +| iterations | 105 | +| time_elapsed | 958 | +| total_timesteps | 1720320 | +| train/ | | +| approx_kl | 0.006440907 | +| clip_fraction | 0.0642 | +| clip_range | 0.2 | +| entropy_loss | -3.22 | +| explained_variance | 0.947 | +| learning_rate | 0.0003 | +| loss | -0.0317 | +| n_updates | 1040 | +| policy_gradient_loss | -0.00158 | +| std | 1.21 | +| value_loss | 0.00165 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1801 | +| iterations | 106 | +| time_elapsed | 963 | +| total_timesteps | 1736704 | +| train/ | | +| approx_kl | 0.006897255 | +| clip_fraction | 0.0738 | +| clip_range | 0.2 | +| entropy_loss | -3.2 | +| explained_variance | 0.939 | +| learning_rate | 0.0003 | +| loss | -0.0408 | +| n_updates | 1050 | +| policy_gradient_loss | -0.00349 | +| std | 1.19 | +| value_loss | 0.00814 | +----------------------------------------- +Eval num_timesteps=1750000, episode_reward=-40.58 +/- 28.91 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -40.6 | +| time/ | | +| total_timesteps | 1750000 | +| train/ | | +| approx_kl | 0.0070952754 | +| clip_fraction | 0.0742 | +| clip_range | 0.2 | +| entropy_loss | -3.19 | +| explained_variance | 0.957 | +| learning_rate | 0.0003 | +| loss | -0.0308 | +| n_updates | 1060 | +| policy_gradient_loss | -0.0037 | +| std | 1.19 | +| value_loss | 0.0191 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1784 | +| iterations | 107 | +| time_elapsed | 982 | +| total_timesteps | 1753088 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1791 | +| iterations | 108 | +| time_elapsed | 987 | +| total_timesteps | 1769472 | +| train/ | | +| approx_kl | 0.006444447 | +| clip_fraction | 0.0736 | +| clip_range | 0.2 | +| entropy_loss | -3.2 | +| explained_variance | 0.968 | +| learning_rate | 0.0003 | +| loss | -0.0362 | +| n_updates | 1070 | +| policy_gradient_loss | -0.00409 | +| std | 1.2 | +| value_loss | 0.00395 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1797 | +| iterations | 109 | +| time_elapsed | 993 | +| total_timesteps | 1785856 | +| train/ | | +| approx_kl | 0.007391736 | +| clip_fraction | 0.0758 | +| clip_range | 0.2 | +| entropy_loss | -3.22 | +| explained_variance | 0.96 | +| learning_rate | 0.0003 | +| loss | -0.0341 | +| n_updates | 1080 | +| policy_gradient_loss | -0.00272 | +| std | 1.21 | +| value_loss | 0.00221 | +----------------------------------------- +Eval num_timesteps=1800000, episode_reward=-29.06 +/- 30.98 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -29.1 | +| time/ | | +| total_timesteps | 1800000 | +| train/ | | +| approx_kl | 0.006899439 | +| clip_fraction | 0.0695 | +| clip_range | 0.2 | +| entropy_loss | -3.25 | +| explained_variance | 0.965 | +| learning_rate | 0.0003 | +| loss | -0.0317 | +| n_updates | 1090 | +| policy_gradient_loss | -0.00226 | +| std | 1.23 | +| value_loss | 0.00615 | +----------------------------------------- + +[Diag @ 1,800,000 | n_sheep=3 | success=0%] + NEVER_COMPACT 11/20 + COMPACT_CANT_DRIVE 9/20 + action_mag mean=0.054 p10=0.003 p90=0.057 (0=stopped, 1=full speed) + min_flock_radius mean=6.01m best=1.13m (target <5m to compact) + min_dog_to_com mean=0.51m best=0.11m (FLEE_DIST=7m) + min_com_to_pen mean=12.52m best=3.21m + reward/step (mean): progress=+0.0050 alignment=+0.0017 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 + +[Curriculum] leaving stage n_sheep=3 after 600,000 steps | training success rate (last 100 eps) = 0% +[Curriculum] → 4 sheep at step 1,800,000 + +-------------------------------- +| time/ | | +| fps | 1769 | +| iterations | 110 | +| time_elapsed | 1018 | +| total_timesteps | 1802240 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1776 | +| iterations | 111 | +| time_elapsed | 1023 | +| total_timesteps | 1818624 | +| train/ | | +| approx_kl | 0.006710761 | +| clip_fraction | 0.0761 | +| clip_range | 0.2 | +| entropy_loss | -3.25 | +| explained_variance | 0.867 | +| learning_rate | 0.0003 | +| loss | -0.031 | +| n_updates | 1100 | +| policy_gradient_loss | -0.00311 | +| std | 1.23 | +| value_loss | 0.0186 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1783 | +| iterations | 112 | +| time_elapsed | 1028 | +| total_timesteps | 1835008 | +| train/ | | +| approx_kl | 0.006202608 | +| clip_fraction | 0.0682 | +| clip_range | 0.2 | +| entropy_loss | -3.25 | +| explained_variance | 0.954 | +| learning_rate | 0.0003 | +| loss | -0.0245 | +| n_updates | 1110 | +| policy_gradient_loss | -0.00429 | +| std | 1.23 | +| value_loss | 0.00641 | +----------------------------------------- +Eval num_timesteps=1850000, episode_reward=-35.87 +/- 42.36 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -35.9 | +| time/ | | +| total_timesteps | 1850000 | +| train/ | | +| approx_kl | 0.008398036 | +| clip_fraction | 0.086 | +| clip_range | 0.2 | +| entropy_loss | -3.28 | +| explained_variance | 0.938 | +| learning_rate | 0.0003 | +| loss | -0.0514 | +| n_updates | 1120 | +| policy_gradient_loss | -0.00497 | +| std | 1.25 | +| value_loss | 0.00614 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1768 | +| iterations | 113 | +| time_elapsed | 1046 | +| total_timesteps | 1851392 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1775 | +| iterations | 114 | +| time_elapsed | 1052 | +| total_timesteps | 1867776 | +| train/ | | +| approx_kl | 0.007641702 | +| clip_fraction | 0.0742 | +| clip_range | 0.2 | +| entropy_loss | -3.31 | +| explained_variance | 0.935 | +| learning_rate | 0.0003 | +| loss | -0.046 | +| n_updates | 1130 | +| policy_gradient_loss | -0.00349 | +| std | 1.28 | +| value_loss | 0.0228 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1781 | +| iterations | 115 | +| time_elapsed | 1057 | +| total_timesteps | 1884160 | +| train/ | | +| approx_kl | 0.0073437546 | +| clip_fraction | 0.0747 | +| clip_range | 0.2 | +| entropy_loss | -3.34 | +| explained_variance | 0.928 | +| learning_rate | 0.0003 | +| loss | -0.0498 | +| n_updates | 1140 | +| policy_gradient_loss | -0.00496 | +| std | 1.29 | +| value_loss | 0.00764 | +------------------------------------------ +Eval num_timesteps=1900000, episode_reward=-41.88 +/- 27.01 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -41.9 | +| time/ | | +| total_timesteps | 1900000 | +| train/ | | +| approx_kl | 0.006885264 | +| clip_fraction | 0.0728 | +| clip_range | 0.2 | +| entropy_loss | -3.36 | +| explained_variance | 0.934 | +| learning_rate | 0.0003 | +| loss | -0.0503 | +| n_updates | 1150 | +| policy_gradient_loss | -0.00384 | +| std | 1.3 | +| value_loss | 0.00423 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1767 | +| iterations | 116 | +| time_elapsed | 1075 | +| total_timesteps | 1900544 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1773 | +| iterations | 117 | +| time_elapsed | 1080 | +| total_timesteps | 1916928 | +| train/ | | +| approx_kl | 0.0077611385 | +| clip_fraction | 0.0792 | +| clip_range | 0.2 | +| entropy_loss | -3.38 | +| explained_variance | 0.931 | +| learning_rate | 0.0003 | +| loss | -0.0374 | +| n_updates | 1160 | +| policy_gradient_loss | -0.00399 | +| std | 1.31 | +| value_loss | 0.00292 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 1780 | +| iterations | 118 | +| time_elapsed | 1085 | +| total_timesteps | 1933312 | +| train/ | | +| approx_kl | 0.006831214 | +| clip_fraction | 0.0758 | +| clip_range | 0.2 | +| entropy_loss | -3.4 | +| explained_variance | 0.963 | +| learning_rate | 0.0003 | +| loss | -0.0175 | +| n_updates | 1170 | +| policy_gradient_loss | -0.00471 | +| std | 1.33 | +| value_loss | 0.00235 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1786 | +| iterations | 119 | +| time_elapsed | 1091 | +| total_timesteps | 1949696 | +| train/ | | +| approx_kl | 0.006474304 | +| clip_fraction | 0.0666 | +| clip_range | 0.2 | +| entropy_loss | -3.43 | +| explained_variance | 0.931 | +| learning_rate | 0.0003 | +| loss | -0.0318 | +| n_updates | 1180 | +| policy_gradient_loss | -0.00285 | +| std | 1.35 | +| value_loss | 0.00699 | +----------------------------------------- +Eval num_timesteps=1950000, episode_reward=-35.80 +/- 28.95 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -35.8 | +| time/ | | +| total_timesteps | 1950000 | +| train/ | | +| approx_kl | 0.008532442 | +| clip_fraction | 0.0746 | +| clip_range | 0.2 | +| entropy_loss | -3.43 | +| explained_variance | 0.958 | +| learning_rate | 0.0003 | +| loss | -0.00337 | +| n_updates | 1190 | +| policy_gradient_loss | -0.00376 | +| std | 1.34 | +| value_loss | 0.0156 | +----------------------------------------- + +[Diag @ 1,950,000 | n_sheep=4 | success=0%] + NEVER_COMPACT 19/20 + COMPACT_CANT_DRIVE 1/20 + action_mag mean=0.049 p10=0.007 p90=0.044 (0=stopped, 1=full speed) + min_flock_radius mean=8.95m best=4.96m (target <5m to compact) + min_dog_to_com mean=0.39m best=0.07m (FLEE_DIST=7m) + min_com_to_pen mean=14.18m best=9.30m + reward/step (mean): progress=-0.0121 alignment=+0.0010 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1759 | +| iterations | 120 | +| time_elapsed | 1117 | +| total_timesteps | 1966080 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1766 | +| iterations | 121 | +| time_elapsed | 1122 | +| total_timesteps | 1982464 | +| train/ | | +| approx_kl | 0.006549825 | +| clip_fraction | 0.0665 | +| clip_range | 0.2 | +| entropy_loss | -3.43 | +| explained_variance | 0.966 | +| learning_rate | 0.0003 | +| loss | -0.0345 | +| n_updates | 1200 | +| policy_gradient_loss | -0.00349 | +| std | 1.34 | +| value_loss | 0.00315 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1773 | +| iterations | 122 | +| time_elapsed | 1127 | +| total_timesteps | 1998848 | +| train/ | | +| approx_kl | 0.0062008686 | +| clip_fraction | 0.0699 | +| clip_range | 0.2 | +| entropy_loss | -3.44 | +| explained_variance | 0.959 | +| learning_rate | 0.0003 | +| loss | -0.0512 | +| n_updates | 1210 | +| policy_gradient_loss | -0.00291 | +| std | 1.35 | +| value_loss | 0.00544 | +------------------------------------------ +Eval num_timesteps=2000000, episode_reward=-45.28 +/- 26.78 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -45.3 | +| time/ | | +| total_timesteps | 2000000 | +| train/ | | +| approx_kl | 0.006553275 | +| clip_fraction | 0.0739 | +| clip_range | 0.2 | +| entropy_loss | -3.45 | +| explained_variance | 0.924 | +| learning_rate | 0.0003 | +| loss | -0.0416 | +| n_updates | 1220 | +| policy_gradient_loss | -0.00427 | +| std | 1.36 | +| value_loss | 0.0127 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1761 | +| iterations | 123 | +| time_elapsed | 1144 | +| total_timesteps | 2015232 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1767 | +| iterations | 124 | +| time_elapsed | 1149 | +| total_timesteps | 2031616 | +| train/ | | +| approx_kl | 0.0059226304 | +| clip_fraction | 0.0653 | +| clip_range | 0.2 | +| entropy_loss | -3.46 | +| explained_variance | 0.947 | +| learning_rate | 0.0003 | +| loss | -0.025 | +| n_updates | 1230 | +| policy_gradient_loss | -0.00273 | +| std | 1.36 | +| value_loss | 0.00879 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1775 | +| iterations | 125 | +| time_elapsed | 1153 | +| total_timesteps | 2048000 | +| train/ | | +| approx_kl | 0.0076779695 | +| clip_fraction | 0.0729 | +| clip_range | 0.2 | +| entropy_loss | -3.47 | +| explained_variance | 0.931 | +| learning_rate | 0.0003 | +| loss | -0.0382 | +| n_updates | 1240 | +| policy_gradient_loss | -0.00385 | +| std | 1.37 | +| value_loss | 0.00692 | +------------------------------------------ +Eval num_timesteps=2050000, episode_reward=-44.22 +/- 28.52 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -44.2 | +| time/ | | +| total_timesteps | 2050000 | +| train/ | | +| approx_kl | 0.0073502595 | +| clip_fraction | 0.0822 | +| clip_range | 0.2 | +| entropy_loss | -3.49 | +| explained_variance | 0.946 | +| learning_rate | 0.0003 | +| loss | -0.0342 | +| n_updates | 1250 | +| policy_gradient_loss | -0.00592 | +| std | 1.39 | +| value_loss | 0.00555 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1764 | +| iterations | 126 | +| time_elapsed | 1170 | +| total_timesteps | 2064384 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1770 | +| iterations | 127 | +| time_elapsed | 1175 | +| total_timesteps | 2080768 | +| train/ | | +| approx_kl | 0.006628736 | +| clip_fraction | 0.0767 | +| clip_range | 0.2 | +| entropy_loss | -3.51 | +| explained_variance | 0.95 | +| learning_rate | 0.0003 | +| loss | -0.035 | +| n_updates | 1260 | +| policy_gradient_loss | -0.00457 | +| std | 1.4 | +| value_loss | 0.00416 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1776 | +| iterations | 128 | +| time_elapsed | 1180 | +| total_timesteps | 2097152 | +| train/ | | +| approx_kl | 0.0068027405 | +| clip_fraction | 0.0719 | +| clip_range | 0.2 | +| entropy_loss | -3.53 | +| explained_variance | 0.891 | +| learning_rate | 0.0003 | +| loss | -0.0391 | +| n_updates | 1270 | +| policy_gradient_loss | -0.00312 | +| std | 1.42 | +| value_loss | 0.00492 | +------------------------------------------ +Eval num_timesteps=2100000, episode_reward=-39.37 +/- 34.76 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -39.4 | +| time/ | | +| total_timesteps | 2100000 | +| train/ | | +| approx_kl | 0.005523986 | +| clip_fraction | 0.0604 | +| clip_range | 0.2 | +| entropy_loss | -3.54 | +| explained_variance | 0.938 | +| learning_rate | 0.0003 | +| loss | -0.0364 | +| n_updates | 1280 | +| policy_gradient_loss | -0.00281 | +| std | 1.42 | +| value_loss | 0.015 | +----------------------------------------- + +[Diag @ 2,100,000 | n_sheep=4 | success=0%] + NEVER_COMPACT 20/20 + action_mag mean=0.047 p10=0.002 p90=0.041 (0=stopped, 1=full speed) + min_flock_radius mean=8.62m best=5.89m (target <5m to compact) + min_dog_to_com mean=0.46m best=0.04m (FLEE_DIST=7m) + min_com_to_pen mean=14.19m best=7.53m + reward/step (mean): progress=-0.0012 alignment=+0.0012 pen_bonus=+0.0010 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1751 | +| iterations | 129 | +| time_elapsed | 1206 | +| total_timesteps | 2113536 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1756 | +| iterations | 130 | +| time_elapsed | 1212 | +| total_timesteps | 2129920 | +| train/ | | +| approx_kl | 0.007766474 | +| clip_fraction | 0.0823 | +| clip_range | 0.2 | +| entropy_loss | -3.53 | +| explained_variance | 0.96 | +| learning_rate | 0.0003 | +| loss | -0.0396 | +| n_updates | 1290 | +| policy_gradient_loss | -0.00492 | +| std | 1.41 | +| value_loss | 0.00554 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1762 | +| iterations | 131 | +| time_elapsed | 1217 | +| total_timesteps | 2146304 | +| train/ | | +| approx_kl | 0.006704482 | +| clip_fraction | 0.0748 | +| clip_range | 0.2 | +| entropy_loss | -3.53 | +| explained_variance | 0.97 | +| learning_rate | 0.0003 | +| loss | -0.0466 | +| n_updates | 1300 | +| policy_gradient_loss | -0.00339 | +| std | 1.42 | +| value_loss | 0.00432 | +----------------------------------------- +Eval num_timesteps=2150000, episode_reward=-43.17 +/- 26.95 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -43.2 | +| time/ | | +| total_timesteps | 2150000 | +| train/ | | +| approx_kl | 0.0065447316 | +| clip_fraction | 0.0751 | +| clip_range | 0.2 | +| entropy_loss | -3.53 | +| explained_variance | 0.888 | +| learning_rate | 0.0003 | +| loss | -0.0369 | +| n_updates | 1310 | +| policy_gradient_loss | -0.00369 | +| std | 1.41 | +| value_loss | 0.0165 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1750 | +| iterations | 132 | +| time_elapsed | 1235 | +| total_timesteps | 2162688 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1755 | +| iterations | 133 | +| time_elapsed | 1241 | +| total_timesteps | 2179072 | +| train/ | | +| approx_kl | 0.0070872563 | +| clip_fraction | 0.075 | +| clip_range | 0.2 | +| entropy_loss | -3.54 | +| explained_variance | 0.954 | +| learning_rate | 0.0003 | +| loss | -0.0427 | +| n_updates | 1320 | +| policy_gradient_loss | -0.00406 | +| std | 1.42 | +| value_loss | 0.00977 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1762 | +| iterations | 134 | +| time_elapsed | 1245 | +| total_timesteps | 2195456 | +| train/ | | +| approx_kl | 0.0073371828 | +| clip_fraction | 0.077 | +| clip_range | 0.2 | +| entropy_loss | -3.55 | +| explained_variance | 0.939 | +| learning_rate | 0.0003 | +| loss | -0.0303 | +| n_updates | 1330 | +| policy_gradient_loss | -0.00371 | +| std | 1.43 | +| value_loss | 0.00862 | +------------------------------------------ +Eval num_timesteps=2200000, episode_reward=-40.81 +/- 44.39 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -40.8 | +| time/ | | +| total_timesteps | 2200000 | +| train/ | | +| approx_kl | 0.0072064474 | +| clip_fraction | 0.0714 | +| clip_range | 0.2 | +| entropy_loss | -3.58 | +| explained_variance | 0.951 | +| learning_rate | 0.0003 | +| loss | -0.0517 | +| n_updates | 1340 | +| policy_gradient_loss | -0.00405 | +| std | 1.45 | +| value_loss | 0.00351 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1751 | +| iterations | 135 | +| time_elapsed | 1262 | +| total_timesteps | 2211840 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1758 | +| iterations | 136 | +| time_elapsed | 1267 | +| total_timesteps | 2228224 | +| train/ | | +| approx_kl | 0.008551812 | +| clip_fraction | 0.0911 | +| clip_range | 0.2 | +| entropy_loss | -3.58 | +| explained_variance | 0.929 | +| learning_rate | 0.0003 | +| loss | -0.0258 | +| n_updates | 1350 | +| policy_gradient_loss | -0.00599 | +| std | 1.45 | +| value_loss | 0.0034 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1764 | +| iterations | 137 | +| time_elapsed | 1271 | +| total_timesteps | 2244608 | +| train/ | | +| approx_kl | 0.006960677 | +| clip_fraction | 0.0702 | +| clip_range | 0.2 | +| entropy_loss | -3.59 | +| explained_variance | 0.9 | +| learning_rate | 0.0003 | +| loss | -0.0396 | +| n_updates | 1360 | +| policy_gradient_loss | -0.00412 | +| std | 1.46 | +| value_loss | 0.00429 | +----------------------------------------- +Eval num_timesteps=2250000, episode_reward=-37.92 +/- 31.68 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -37.9 | +| time/ | | +| total_timesteps | 2250000 | +| train/ | | +| approx_kl | 0.005949891 | +| clip_fraction | 0.0683 | +| clip_range | 0.2 | +| entropy_loss | -3.59 | +| explained_variance | 0.948 | +| learning_rate | 0.0003 | +| loss | -0.0381 | +| n_updates | 1370 | +| policy_gradient_loss | -0.00328 | +| std | 1.46 | +| value_loss | 0.0113 | +----------------------------------------- + +[Diag @ 2,250,000 | n_sheep=4 | success=0%] + NEVER_COMPACT 19/20 + COMPACT_CANT_DRIVE 1/20 + action_mag mean=0.068 p10=0.004 p90=0.045 (0=stopped, 1=full speed) + min_flock_radius mean=7.87m best=3.57m (target <5m to compact) + min_dog_to_com mean=0.45m best=0.15m (FLEE_DIST=7m) + min_com_to_pen mean=14.06m best=6.95m + reward/step (mean): progress=-0.0035 alignment=+0.0020 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1743 | +| iterations | 138 | +| time_elapsed | 1297 | +| total_timesteps | 2260992 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1749 | +| iterations | 139 | +| time_elapsed | 1301 | +| total_timesteps | 2277376 | +| train/ | | +| approx_kl | 0.0071727796 | +| clip_fraction | 0.0784 | +| clip_range | 0.2 | +| entropy_loss | -3.6 | +| explained_variance | 0.943 | +| learning_rate | 0.0003 | +| loss | -0.0387 | +| n_updates | 1380 | +| policy_gradient_loss | -0.0042 | +| std | 1.46 | +| value_loss | 0.0113 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 1755 | +| iterations | 140 | +| time_elapsed | 1306 | +| total_timesteps | 2293760 | +| train/ | | +| approx_kl | 0.006800391 | +| clip_fraction | 0.0662 | +| clip_range | 0.2 | +| entropy_loss | -3.59 | +| explained_variance | 0.931 | +| learning_rate | 0.0003 | +| loss | -0.0283 | +| n_updates | 1390 | +| policy_gradient_loss | -0.00421 | +| std | 1.46 | +| value_loss | 0.00659 | +----------------------------------------- +Eval num_timesteps=2300000, episode_reward=-47.47 +/- 37.24 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -47.5 | +| time/ | | +| total_timesteps | 2300000 | +| train/ | | +| approx_kl | 0.008103053 | +| clip_fraction | 0.081 | +| clip_range | 0.2 | +| entropy_loss | -3.59 | +| explained_variance | 0.945 | +| learning_rate | 0.0003 | +| loss | -0.0433 | +| n_updates | 1400 | +| policy_gradient_loss | -0.00404 | +| std | 1.46 | +| value_loss | 0.00796 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1745 | +| iterations | 141 | +| time_elapsed | 1323 | +| total_timesteps | 2310144 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1751 | +| iterations | 142 | +| time_elapsed | 1328 | +| total_timesteps | 2326528 | +| train/ | | +| approx_kl | 0.0061590094 | +| clip_fraction | 0.066 | +| clip_range | 0.2 | +| entropy_loss | -3.61 | +| explained_variance | 0.957 | +| learning_rate | 0.0003 | +| loss | -0.0436 | +| n_updates | 1410 | +| policy_gradient_loss | -0.00287 | +| std | 1.47 | +| value_loss | 0.0102 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1757 | +| iterations | 143 | +| time_elapsed | 1332 | +| total_timesteps | 2342912 | +| train/ | | +| approx_kl | 0.0070403973 | +| clip_fraction | 0.0733 | +| clip_range | 0.2 | +| entropy_loss | -3.62 | +| explained_variance | 0.863 | +| learning_rate | 0.0003 | +| loss | -0.0356 | +| n_updates | 1420 | +| policy_gradient_loss | -0.00525 | +| std | 1.48 | +| value_loss | 0.0103 | +------------------------------------------ +Eval num_timesteps=2350000, episode_reward=-47.95 +/- 27.60 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -48 | +| time/ | | +| total_timesteps | 2350000 | +| train/ | | +| approx_kl | 0.007505033 | +| clip_fraction | 0.0729 | +| clip_range | 0.2 | +| entropy_loss | -3.64 | +| explained_variance | 0.94 | +| learning_rate | 0.0003 | +| loss | -0.0473 | +| n_updates | 1430 | +| policy_gradient_loss | -0.00385 | +| std | 1.5 | +| value_loss | 0.00449 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1747 | +| iterations | 144 | +| time_elapsed | 1350 | +| total_timesteps | 2359296 | +-------------------------------- +---------------------------------------- +| time/ | | +| fps | 1752 | +| iterations | 145 | +| time_elapsed | 1355 | +| total_timesteps | 2375680 | +| train/ | | +| approx_kl | 0.00724002 | +| clip_fraction | 0.0739 | +| clip_range | 0.2 | +| entropy_loss | -3.65 | +| explained_variance | 0.948 | +| learning_rate | 0.0003 | +| loss | -0.0419 | +| n_updates | 1440 | +| policy_gradient_loss | -0.00426 | +| std | 1.5 | +| value_loss | 0.00886 | +---------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1758 | +| iterations | 146 | +| time_elapsed | 1360 | +| total_timesteps | 2392064 | +| train/ | | +| approx_kl | 0.007578165 | +| clip_fraction | 0.0713 | +| clip_range | 0.2 | +| entropy_loss | -3.64 | +| explained_variance | 0.859 | +| learning_rate | 0.0003 | +| loss | -0.0427 | +| n_updates | 1450 | +| policy_gradient_loss | -0.0049 | +| std | 1.49 | +| value_loss | 0.00429 | +----------------------------------------- +Eval num_timesteps=2400000, episode_reward=-47.88 +/- 34.39 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -47.9 | +| time/ | | +| total_timesteps | 2400000 | +| train/ | | +| approx_kl | 0.006707498 | +| clip_fraction | 0.0692 | +| clip_range | 0.2 | +| entropy_loss | -3.65 | +| explained_variance | 0.861 | +| learning_rate | 0.0003 | +| loss | -0.0426 | +| n_updates | 1460 | +| policy_gradient_loss | -0.00411 | +| std | 1.5 | +| value_loss | 0.00639 | +----------------------------------------- + +[Diag @ 2,400,000 | n_sheep=4 | success=0%] + NEVER_COMPACT 19/20 + COMPACT_CANT_DRIVE 1/20 + action_mag mean=0.052 p10=0.005 p90=0.045 (0=stopped, 1=full speed) + min_flock_radius mean=8.79m best=3.32m (target <5m to compact) + min_dog_to_com mean=0.45m best=0.20m (FLEE_DIST=7m) + min_com_to_pen mean=13.96m best=9.02m + reward/step (mean): progress=-0.0047 alignment=+0.0013 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1737 | +| iterations | 147 | +| time_elapsed | 1386 | +| total_timesteps | 2408448 | +-------------------------------- + +Training complete. Artefacts saved to runs/ppo_fix_check2/ diff --git a/training/runs/ppo_fix_check2/best_model/best_model.zip b/training/runs/ppo_fix_check2/best_model/best_model.zip new file mode 100644 index 0000000..b07d85b Binary files /dev/null and b/training/runs/ppo_fix_check2/best_model/best_model.zip differ diff --git a/training/runs/ppo_fix_check2/evaluations.npz b/training/runs/ppo_fix_check2/evaluations.npz new file mode 100644 index 0000000..cc6f67e Binary files /dev/null and b/training/runs/ppo_fix_check2/evaluations.npz differ diff --git a/training/runs/ppo_fix_check2/final_model.zip b/training/runs/ppo_fix_check2/final_model.zip new file mode 100644 index 0000000..ac482b3 Binary files /dev/null and b/training/runs/ppo_fix_check2/final_model.zip differ diff --git a/training/runs/ppo_fix_check2/vecnorm.pkl b/training/runs/ppo_fix_check2/vecnorm.pkl new file mode 100644 index 0000000..20a640e Binary files /dev/null and b/training/runs/ppo_fix_check2/vecnorm.pkl differ