diff --git a/training/runs/ppo_fix_check.log b/training/runs/ppo_fix_check.log new file mode 100644 index 0000000..39ace5a --- /dev/null +++ b/training/runs/ppo_fix_check.log @@ -0,0 +1,3388 @@ +Using cpu device +Logging to runs/ppo_fix_check/ppo_1 +------------------------------ +| time/ | | +| fps | 5021 | +| iterations | 1 | +| time_elapsed | 3 | +| total_timesteps | 16384 | +------------------------------ +------------------------------------------ +| time/ | | +| fps | 4241 | +| iterations | 2 | +| time_elapsed | 7 | +| total_timesteps | 32768 | +| train/ | | +| approx_kl | 0.0047510993 | +| clip_fraction | 0.0344 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.786 | +| learning_rate | 0.0003 | +| loss | -0.00995 | +| n_updates | 10 | +| policy_gradient_loss | -0.00156 | +| std | 1.01 | +| value_loss | 0.0657 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 4026 | +| iterations | 3 | +| time_elapsed | 12 | +| total_timesteps | 49152 | +| train/ | | +| approx_kl | 0.0032065492 | +| clip_fraction | 0.0328 | +| clip_range | 0.2 | +| entropy_loss | -2.88 | +| explained_variance | 0.868 | +| learning_rate | 0.0003 | +| loss | -0.0327 | +| n_updates | 20 | +| policy_gradient_loss | -0.00152 | +| std | 1.02 | +| value_loss | 0.0172 | +------------------------------------------ +/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. + warnings.warn( +Eval num_timesteps=50000, episode_reward=-25.33 +/- 56.30 +Episode length: 1859.00 +/- 393.69 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.86e+03 | +| mean_reward | -25.3 | +| time/ | | +| total_timesteps | 50000 | +| train/ | | +| approx_kl | 0.0038272792 | +| clip_fraction | 0.0312 | +| clip_range | 0.2 | +| entropy_loss | -2.89 | +| explained_variance | 0.891 | +| learning_rate | 0.0003 | +| loss | -0.0224 | +| n_updates | 30 | +| policy_gradient_loss | -0.0019 | +| std | 1.02 | +| value_loss | 0.0227 | +------------------------------------------ +New best mean reward! +------------------------------ +| time/ | | +| fps | 2387 | +| iterations | 4 | +| time_elapsed | 27 | +| total_timesteps | 65536 | +------------------------------ +------------------------------------------ +| time/ | | +| fps | 2563 | +| iterations | 5 | +| time_elapsed | 31 | +| total_timesteps | 81920 | +| train/ | | +| approx_kl | 0.0040233894 | +| clip_fraction | 0.0323 | +| clip_range | 0.2 | +| entropy_loss | -2.87 | +| explained_variance | 0.878 | +| learning_rate | 0.0003 | +| loss | -0.0251 | +| n_updates | 40 | +| policy_gradient_loss | -0.00247 | +| std | 1.01 | +| value_loss | 0.0169 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 2719 | +| iterations | 6 | +| time_elapsed | 36 | +| total_timesteps | 98304 | +| train/ | | +| approx_kl | 0.003573698 | +| clip_fraction | 0.0316 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.865 | +| learning_rate | 0.0003 | +| loss | -0.0219 | +| n_updates | 50 | +| policy_gradient_loss | -0.0019 | +| std | 1.01 | +| value_loss | 0.022 | +----------------------------------------- +/home/jalf/miniconda3/envs/tir/lib/python3.12/site-packages/stable_baselines3/common/evaluation.py:71: UserWarning: Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper. + warnings.warn( +Eval num_timesteps=100000, episode_reward=-29.60 +/- 36.59 +Episode length: 1939.35 +/- 264.37 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.94e+03 | +| mean_reward | -29.6 | +| time/ | | +| total_timesteps | 100000 | +| train/ | | +| approx_kl | 0.0046861977 | +| clip_fraction | 0.039 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.815 | +| learning_rate | 0.0003 | +| loss | -0.0257 | +| n_updates | 60 | +| policy_gradient_loss | -0.00203 | +| std | 1.01 | +| value_loss | 0.0201 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 2191 | +| iterations | 7 | +| time_elapsed | 52 | +| total_timesteps | 114688 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 2314 | +| iterations | 8 | +| time_elapsed | 56 | +| total_timesteps | 131072 | +| train/ | | +| approx_kl | 0.005258695 | +| clip_fraction | 0.0503 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.807 | +| learning_rate | 0.0003 | +| loss | -0.0211 | +| n_updates | 70 | +| policy_gradient_loss | -0.00398 | +| std | 1.01 | +| value_loss | 0.0164 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 2359 | +| iterations | 9 | +| time_elapsed | 62 | +| total_timesteps | 147456 | +| train/ | | +| approx_kl | 0.0043328116 | +| clip_fraction | 0.0332 | +| clip_range | 0.2 | +| entropy_loss | -2.86 | +| explained_variance | 0.811 | +| learning_rate | 0.0003 | +| loss | -0.0259 | +| n_updates | 80 | +| policy_gradient_loss | -0.00173 | +| std | 1.01 | +| value_loss | 0.0121 | +------------------------------------------ +Eval num_timesteps=150000, episode_reward=-33.97 +/- 37.15 +Episode length: 1954.85 +/- 196.80 +----------------------------------------- +| eval/ | | +| mean_ep_length | 1.95e+03 | +| mean_reward | -34 | +| time/ | | +| total_timesteps | 150000 | +| train/ | | +| approx_kl | 0.005169191 | +| clip_fraction | 0.0506 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.649 | +| learning_rate | 0.0003 | +| loss | -0.0287 | +| n_updates | 90 | +| policy_gradient_loss | -0.00384 | +| std | 1 | +| value_loss | 0.0162 | +----------------------------------------- + +[Diag @ 150,000 | n_sheep=1 | success=15%] + COMPACT_CANT_DRIVE 16/20 + SUCCESS 3/20 + DROVE_NO_SHEEP 1/20 + action_mag mean=0.239 p10=0.071 p90=0.433 (0=stopped, 1=full speed) + min_flock_radius mean=0.00m best=0.00m (target <5m to compact) + min_dog_to_com mean=4.80m best=1.70m (FLEE_DIST=7m) + min_com_to_pen mean=10.22m best=1.50m + reward/step (mean): progress=+0.0013 alignment=+0.0000 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0078 +------------------------------- +| time/ | | +| fps | 1935 | +| iterations | 10 | +| time_elapsed | 84 | +| total_timesteps | 163840 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 2014 | +| iterations | 11 | +| time_elapsed | 89 | +| total_timesteps | 180224 | +| train/ | | +| approx_kl | 0.0039950563 | +| clip_fraction | 0.0276 | +| clip_range | 0.2 | +| entropy_loss | -2.83 | +| explained_variance | 0.623 | +| learning_rate | 0.0003 | +| loss | -0.0128 | +| n_updates | 100 | +| policy_gradient_loss | -0.00208 | +| std | 0.995 | +| value_loss | 0.0959 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 2093 | +| iterations | 12 | +| time_elapsed | 93 | +| total_timesteps | 196608 | +| train/ | | +| approx_kl | 0.0036244316 | +| clip_fraction | 0.0299 | +| clip_range | 0.2 | +| entropy_loss | -2.83 | +| explained_variance | 0.916 | +| learning_rate | 0.0003 | +| loss | -0.0251 | +| n_updates | 110 | +| policy_gradient_loss | -0.00229 | +| std | 0.991 | +| value_loss | 0.0118 | +------------------------------------------ +Eval num_timesteps=200000, episode_reward=-36.37 +/- 39.41 +Episode length: 1950.95 +/- 213.80 +----------------------------------------- +| eval/ | | +| mean_ep_length | 1.95e+03 | +| mean_reward | -36.4 | +| time/ | | +| total_timesteps | 200000 | +| train/ | | +| approx_kl | 0.003325508 | +| clip_fraction | 0.0223 | +| clip_range | 0.2 | +| entropy_loss | -2.83 | +| explained_variance | 0.858 | +| learning_rate | 0.0003 | +| loss | -0.0279 | +| n_updates | 120 | +| policy_gradient_loss | -0.0007 | +| std | 0.999 | +| value_loss | 0.0493 | +----------------------------------------- +------------------------------- +| time/ | | +| fps | 1964 | +| iterations | 13 | +| time_elapsed | 108 | +| total_timesteps | 212992 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 2034 | +| iterations | 14 | +| time_elapsed | 112 | +| total_timesteps | 229376 | +| train/ | | +| approx_kl | 0.004660043 | +| clip_fraction | 0.0403 | +| clip_range | 0.2 | +| entropy_loss | -2.85 | +| explained_variance | 0.719 | +| learning_rate | 0.0003 | +| loss | 0.128 | +| n_updates | 130 | +| policy_gradient_loss | -0.00265 | +| std | 1.01 | +| value_loss | 0.073 | +----------------------------------------- +---------------------------------------- +| time/ | | +| fps | 2103 | +| iterations | 15 | +| time_elapsed | 116 | +| total_timesteps | 245760 | +| train/ | | +| approx_kl | 0.00501227 | +| clip_fraction | 0.0499 | +| clip_range | 0.2 | +| entropy_loss | -2.88 | +| explained_variance | 0.847 | +| learning_rate | 0.0003 | +| loss | -0.0237 | +| n_updates | 140 | +| policy_gradient_loss | -0.00264 | +| std | 1.02 | +| value_loss | 0.0415 | +---------------------------------------- +Eval num_timesteps=250000, episode_reward=-44.92 +/- 15.63 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -44.9 | +| time/ | | +| total_timesteps | 250000 | +| train/ | | +| approx_kl | 0.0055294414 | +| clip_fraction | 0.06 | +| clip_range | 0.2 | +| entropy_loss | -2.89 | +| explained_variance | 0.951 | +| learning_rate | 0.0003 | +| loss | -0.0274 | +| n_updates | 150 | +| policy_gradient_loss | -0.00491 | +| std | 1.03 | +| value_loss | 0.014 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 1999 | +| iterations | 16 | +| time_elapsed | 131 | +| total_timesteps | 262144 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 2051 | +| iterations | 17 | +| time_elapsed | 135 | +| total_timesteps | 278528 | +| train/ | | +| approx_kl | 0.0051201656 | +| clip_fraction | 0.0301 | +| clip_range | 0.2 | +| entropy_loss | -2.88 | +| explained_variance | 0.941 | +| learning_rate | 0.0003 | +| loss | 0.148 | +| n_updates | 160 | +| policy_gradient_loss | -0.00199 | +| std | 1.02 | +| value_loss | 0.099 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 2096 | +| iterations | 18 | +| time_elapsed | 140 | +| total_timesteps | 294912 | +| train/ | | +| approx_kl | 0.004261789 | +| clip_fraction | 0.0328 | +| clip_range | 0.2 | +| entropy_loss | -2.88 | +| explained_variance | 0.942 | +| learning_rate | 0.0003 | +| loss | -0.0314 | +| n_updates | 170 | +| policy_gradient_loss | -0.00243 | +| std | 1.02 | +| value_loss | 0.0117 | +----------------------------------------- +Eval num_timesteps=300000, episode_reward=-44.79 +/- 17.68 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -44.8 | +| time/ | | +| total_timesteps | 300000 | +| train/ | | +| approx_kl | 0.004783842 | +| clip_fraction | 0.0296 | +| clip_range | 0.2 | +| entropy_loss | -2.87 | +| explained_variance | 0.892 | +| learning_rate | 0.0003 | +| loss | -0.0219 | +| n_updates | 180 | +| policy_gradient_loss | -0.00159 | +| std | 1.01 | +| value_loss | 0.0497 | +----------------------------------------- + +[Diag @ 300,000 | n_sheep=1 | success=0%] + COMPACT_CANT_DRIVE 17/20 + DROVE_NO_SHEEP 3/20 + action_mag mean=0.241 p10=0.109 p90=0.389 (0=stopped, 1=full speed) + min_flock_radius mean=0.00m best=0.00m (target <5m to compact) + min_dog_to_com mean=4.77m best=2.12m (FLEE_DIST=7m) + min_com_to_pen mean=9.31m best=1.50m + reward/step (mean): progress=+0.0016 alignment=+0.0000 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 +------------------------------- +| time/ | | +| fps | 1905 | +| iterations | 19 | +| time_elapsed | 163 | +| total_timesteps | 311296 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 1949 | +| iterations | 20 | +| time_elapsed | 168 | +| total_timesteps | 327680 | +| train/ | | +| approx_kl | 0.0033368056 | +| clip_fraction | 0.0258 | +| clip_range | 0.2 | +| entropy_loss | -2.87 | +| explained_variance | 0.794 | +| learning_rate | 0.0003 | +| loss | -0.0211 | +| n_updates | 190 | +| policy_gradient_loss | -0.00105 | +| std | 1.02 | +| value_loss | 0.0769 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1992 | +| iterations | 21 | +| time_elapsed | 172 | +| total_timesteps | 344064 | +| train/ | | +| approx_kl | 0.0046488494 | +| clip_fraction | 0.0352 | +| clip_range | 0.2 | +| entropy_loss | -2.87 | +| explained_variance | 0.927 | +| learning_rate | 0.0003 | +| loss | -0.0274 | +| n_updates | 200 | +| policy_gradient_loss | -0.00331 | +| std | 1.02 | +| value_loss | 0.0165 | +------------------------------------------ +Eval num_timesteps=350000, episode_reward=-24.90 +/- 50.25 +Episode length: 1976.75 +/- 82.03 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.98e+03 | +| mean_reward | -24.9 | +| time/ | | +| total_timesteps | 350000 | +| train/ | | +| approx_kl | 0.0041725934 | +| clip_fraction | 0.0299 | +| clip_range | 0.2 | +| entropy_loss | -2.88 | +| explained_variance | 0.944 | +| learning_rate | 0.0003 | +| loss | -0.026 | +| n_updates | 210 | +| policy_gradient_loss | -0.0026 | +| std | 1.02 | +| value_loss | 0.00665 | +------------------------------------------ +New best mean reward! +------------------------------- +| time/ | | +| fps | 1921 | +| iterations | 22 | +| time_elapsed | 187 | +| total_timesteps | 360448 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1963 | +| iterations | 23 | +| time_elapsed | 191 | +| total_timesteps | 376832 | +| train/ | | +| approx_kl | 0.005180447 | +| clip_fraction | 0.0532 | +| clip_range | 0.2 | +| entropy_loss | -2.87 | +| explained_variance | 0.956 | +| learning_rate | 0.0003 | +| loss | -0.0255 | +| n_updates | 220 | +| policy_gradient_loss | -0.00352 | +| std | 1.02 | +| value_loss | 0.0142 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1990 | +| iterations | 24 | +| time_elapsed | 197 | +| total_timesteps | 393216 | +| train/ | | +| approx_kl | 0.004661506 | +| clip_fraction | 0.0443 | +| clip_range | 0.2 | +| entropy_loss | -2.87 | +| explained_variance | 0.967 | +| learning_rate | 0.0003 | +| loss | -0.0331 | +| n_updates | 230 | +| policy_gradient_loss | -0.00441 | +| std | 1.02 | +| value_loss | 0.0112 | +----------------------------------------- +Eval num_timesteps=400000, episode_reward=-26.04 +/- 47.69 +Episode length: 1890.85 +/- 367.20 +----------------------------------------- +| eval/ | | +| mean_ep_length | 1.89e+03 | +| mean_reward | -26 | +| time/ | | +| total_timesteps | 400000 | +| train/ | | +| approx_kl | 0.005491742 | +| clip_fraction | 0.0538 | +| clip_range | 0.2 | +| entropy_loss | -2.89 | +| explained_variance | 0.941 | +| learning_rate | 0.0003 | +| loss | -0.042 | +| n_updates | 240 | +| policy_gradient_loss | -0.00297 | +| std | 1.03 | +| value_loss | 0.00877 | +----------------------------------------- +------------------------------- +| time/ | | +| fps | 1927 | +| iterations | 25 | +| time_elapsed | 212 | +| total_timesteps | 409600 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 1966 | +| iterations | 26 | +| time_elapsed | 216 | +| total_timesteps | 425984 | +| train/ | | +| approx_kl | 0.0045445506 | +| clip_fraction | 0.0385 | +| clip_range | 0.2 | +| entropy_loss | -2.91 | +| explained_variance | 0.941 | +| learning_rate | 0.0003 | +| loss | -0.0343 | +| n_updates | 250 | +| policy_gradient_loss | -0.00307 | +| std | 1.04 | +| value_loss | 0.00818 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 2004 | +| iterations | 27 | +| time_elapsed | 220 | +| total_timesteps | 442368 | +| train/ | | +| approx_kl | 0.0045271795 | +| clip_fraction | 0.0373 | +| clip_range | 0.2 | +| entropy_loss | -2.94 | +| explained_variance | 0.97 | +| learning_rate | 0.0003 | +| loss | -0.0361 | +| n_updates | 260 | +| policy_gradient_loss | -0.00236 | +| std | 1.05 | +| value_loss | 0.0091 | +------------------------------------------ +Eval num_timesteps=450000, episode_reward=-24.58 +/- 48.73 +Episode length: 1907.85 +/- 276.46 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.91e+03 | +| mean_reward | -24.6 | +| time/ | | +| total_timesteps | 450000 | +| train/ | | +| approx_kl | 0.0052676853 | +| clip_fraction | 0.0498 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.948 | +| learning_rate | 0.0003 | +| loss | -0.0261 | +| n_updates | 270 | +| policy_gradient_loss | -0.00236 | +| std | 1.07 | +| value_loss | 0.0286 | +------------------------------------------ +New best mean reward! + +[Diag @ 450,000 | n_sheep=1 | success=5%] + COMPACT_CANT_DRIVE 18/20 + DROVE_NO_SHEEP 1/20 + SUCCESS 1/20 + action_mag mean=0.272 p10=0.139 p90=0.407 (0=stopped, 1=full speed) + min_flock_radius mean=0.00m best=0.00m (target <5m to compact) + min_dog_to_com mean=4.81m best=1.54m (FLEE_DIST=7m) + min_com_to_pen mean=12.36m best=1.96m + reward/step (mean): progress=+0.0012 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0025 +------------------------------- +| time/ | | +| fps | 1893 | +| iterations | 28 | +| time_elapsed | 242 | +| total_timesteps | 458752 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1928 | +| iterations | 29 | +| time_elapsed | 246 | +| total_timesteps | 475136 | +| train/ | | +| approx_kl | 0.004465497 | +| clip_fraction | 0.0376 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.948 | +| learning_rate | 0.0003 | +| loss | -0.0307 | +| n_updates | 280 | +| policy_gradient_loss | -0.00259 | +| std | 1.07 | +| value_loss | 0.0213 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1961 | +| iterations | 30 | +| time_elapsed | 250 | +| total_timesteps | 491520 | +| train/ | | +| approx_kl | 0.0054338034 | +| clip_fraction | 0.0512 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.967 | +| learning_rate | 0.0003 | +| loss | -0.021 | +| n_updates | 290 | +| policy_gradient_loss | -0.00296 | +| std | 1.07 | +| value_loss | 0.0138 | +------------------------------------------ +Eval num_timesteps=500000, episode_reward=-44.13 +/- 20.75 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -44.1 | +| time/ | | +| total_timesteps | 500000 | +| train/ | | +| approx_kl | 0.006292434 | +| clip_fraction | 0.0572 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.937 | +| learning_rate | 0.0003 | +| loss | -0.0398 | +| n_updates | 300 | +| policy_gradient_loss | -0.00516 | +| std | 1.07 | +| value_loss | 0.00832 | +----------------------------------------- +------------------------------- +| time/ | | +| fps | 1913 | +| iterations | 31 | +| time_elapsed | 265 | +| total_timesteps | 507904 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 1940 | +| iterations | 32 | +| time_elapsed | 270 | +| total_timesteps | 524288 | +| train/ | | +| approx_kl | 0.0063960385 | +| clip_fraction | 0.0702 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.942 | +| learning_rate | 0.0003 | +| loss | -0.0341 | +| n_updates | 310 | +| policy_gradient_loss | -0.00436 | +| std | 1.06 | +| value_loss | 0.0189 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1968 | +| iterations | 33 | +| time_elapsed | 274 | +| total_timesteps | 540672 | +| train/ | | +| approx_kl | 0.0070166546 | +| clip_fraction | 0.0888 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.955 | +| learning_rate | 0.0003 | +| loss | -0.0376 | +| n_updates | 320 | +| policy_gradient_loss | -0.00631 | +| std | 1.06 | +| value_loss | 0.00861 | +------------------------------------------ +Eval num_timesteps=550000, episode_reward=-38.60 +/- 14.53 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -38.6 | +| time/ | | +| total_timesteps | 550000 | +| train/ | | +| approx_kl | 0.0068266992 | +| clip_fraction | 0.075 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.959 | +| learning_rate | 0.0003 | +| loss | -0.0252 | +| n_updates | 330 | +| policy_gradient_loss | -0.00593 | +| std | 1.07 | +| value_loss | 0.0131 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 1922 | +| iterations | 34 | +| time_elapsed | 289 | +| total_timesteps | 557056 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1950 | +| iterations | 35 | +| time_elapsed | 294 | +| total_timesteps | 573440 | +| train/ | | +| approx_kl | 0.006152669 | +| clip_fraction | 0.0626 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.954 | +| learning_rate | 0.0003 | +| loss | -0.0376 | +| n_updates | 340 | +| policy_gradient_loss | -0.00514 | +| std | 1.07 | +| value_loss | 0.0187 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1977 | +| iterations | 36 | +| time_elapsed | 298 | +| total_timesteps | 589824 | +| train/ | | +| approx_kl | 0.006685758 | +| clip_fraction | 0.0729 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.958 | +| learning_rate | 0.0003 | +| loss | -0.0387 | +| n_updates | 350 | +| policy_gradient_loss | -0.00632 | +| std | 1.07 | +| value_loss | 0.0118 | +----------------------------------------- +Eval num_timesteps=600000, episode_reward=-31.39 +/- 8.94 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -31.4 | +| time/ | | +| total_timesteps | 600000 | +| train/ | | +| approx_kl | 0.008094068 | +| clip_fraction | 0.0985 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.937 | +| learning_rate | 0.0003 | +| loss | -0.0439 | +| n_updates | 360 | +| policy_gradient_loss | -0.00782 | +| std | 1.07 | +| value_loss | 0.0116 | +----------------------------------------- + +[Diag @ 600,000 | n_sheep=1 | success=5%] + COMPACT_CANT_DRIVE 16/20 + DROVE_NO_SHEEP 3/20 + SUCCESS 1/20 + action_mag mean=0.150 p10=0.000 p90=0.392 (0=stopped, 1=full speed) + min_flock_radius mean=0.00m best=0.00m (target <5m to compact) + min_dog_to_com mean=3.64m best=0.68m (FLEE_DIST=7m) + min_com_to_pen mean=10.60m best=1.50m + reward/step (mean): progress=+0.0025 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0026 + +[Curriculum] leaving stage n_sheep=1 after 600,000 steps | training success rate (last 100 eps) = 9% +[Curriculum] → 2 sheep at step 600,000 + +------------------------------- +| time/ | | +| fps | 1894 | +| iterations | 37 | +| time_elapsed | 319 | +| total_timesteps | 606208 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 1917 | +| iterations | 38 | +| time_elapsed | 324 | +| total_timesteps | 622592 | +| train/ | | +| approx_kl | 0.0067913756 | +| clip_fraction | 0.0689 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.861 | +| learning_rate | 0.0003 | +| loss | 0.0772 | +| n_updates | 370 | +| policy_gradient_loss | -0.00184 | +| std | 1.07 | +| value_loss | 0.101 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1938 | +| iterations | 39 | +| time_elapsed | 329 | +| total_timesteps | 638976 | +| train/ | | +| approx_kl | 0.0061344057 | +| clip_fraction | 0.0666 | +| clip_range | 0.2 | +| entropy_loss | -2.98 | +| explained_variance | 0.928 | +| learning_rate | 0.0003 | +| loss | -0.0147 | +| n_updates | 380 | +| policy_gradient_loss | -0.00148 | +| std | 1.08 | +| value_loss | 0.0386 | +------------------------------------------ +Eval num_timesteps=650000, episode_reward=-42.39 +/- 31.99 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -42.4 | +| time/ | | +| total_timesteps | 650000 | +| train/ | | +| approx_kl | 0.0061708866 | +| clip_fraction | 0.06 | +| clip_range | 0.2 | +| entropy_loss | -2.98 | +| explained_variance | 0.918 | +| learning_rate | 0.0003 | +| loss | -0.0203 | +| n_updates | 390 | +| policy_gradient_loss | -0.00313 | +| std | 1.07 | +| value_loss | 0.0242 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 1896 | +| iterations | 40 | +| time_elapsed | 345 | +| total_timesteps | 655360 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1918 | +| iterations | 41 | +| time_elapsed | 350 | +| total_timesteps | 671744 | +| train/ | | +| approx_kl | 0.007122565 | +| clip_fraction | 0.0765 | +| clip_range | 0.2 | +| entropy_loss | -2.98 | +| explained_variance | 0.855 | +| learning_rate | 0.0003 | +| loss | -0.00749 | +| n_updates | 400 | +| policy_gradient_loss | -0.00529 | +| std | 1.07 | +| value_loss | 0.0596 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1941 | +| iterations | 42 | +| time_elapsed | 354 | +| total_timesteps | 688128 | +| train/ | | +| approx_kl | 0.0078532845 | +| clip_fraction | 0.0975 | +| clip_range | 0.2 | +| entropy_loss | -2.98 | +| explained_variance | 0.89 | +| learning_rate | 0.0003 | +| loss | -0.0188 | +| n_updates | 410 | +| policy_gradient_loss | -0.00699 | +| std | 1.07 | +| value_loss | 0.0207 | +------------------------------------------ +Eval num_timesteps=700000, episode_reward=-39.79 +/- 29.60 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -39.8 | +| time/ | | +| total_timesteps | 700000 | +| train/ | | +| approx_kl | 0.0073551387 | +| clip_fraction | 0.084 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.824 | +| learning_rate | 0.0003 | +| loss | 0.0126 | +| n_updates | 420 | +| policy_gradient_loss | -0.0064 | +| std | 1.06 | +| value_loss | 0.0438 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 1904 | +| iterations | 43 | +| time_elapsed | 370 | +| total_timesteps | 704512 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1922 | +| iterations | 44 | +| time_elapsed | 375 | +| total_timesteps | 720896 | +| train/ | | +| approx_kl | 0.006614036 | +| clip_fraction | 0.0611 | +| clip_range | 0.2 | +| entropy_loss | -2.95 | +| explained_variance | 0.881 | +| learning_rate | 0.0003 | +| loss | -0.0207 | +| n_updates | 430 | +| policy_gradient_loss | -0.00371 | +| std | 1.06 | +| value_loss | 0.0244 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1940 | +| iterations | 45 | +| time_elapsed | 380 | +| total_timesteps | 737280 | +| train/ | | +| approx_kl | 0.0060790265 | +| clip_fraction | 0.0591 | +| clip_range | 0.2 | +| entropy_loss | -2.95 | +| explained_variance | 0.885 | +| learning_rate | 0.0003 | +| loss | -0.0284 | +| n_updates | 440 | +| policy_gradient_loss | -0.00447 | +| std | 1.06 | +| value_loss | 0.0206 | +------------------------------------------ +Eval num_timesteps=750000, episode_reward=-40.21 +/- 27.55 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -40.2 | +| time/ | | +| total_timesteps | 750000 | +| train/ | | +| approx_kl | 0.0066163363 | +| clip_fraction | 0.0691 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.924 | +| learning_rate | 0.0003 | +| loss | -0.032 | +| n_updates | 450 | +| policy_gradient_loss | -0.0043 | +| std | 1.06 | +| value_loss | 0.0127 | +------------------------------------------ + +[Diag @ 750,000 | n_sheep=2 | success=0%] + COMPACT_CANT_DRIVE 14/20 + NEVER_COMPACT 5/20 + DROVE_NO_SHEEP 1/20 + action_mag mean=0.313 p10=0.081 p90=0.638 (0=stopped, 1=full speed) + min_flock_radius mean=2.72m best=0.00m (target <5m to compact) + min_dog_to_com mean=3.96m best=0.02m (FLEE_DIST=7m) + min_com_to_pen mean=12.68m best=2.17m + reward/step (mean): progress=-0.0005 alignment=+0.0000 pen_bonus=+0.0008 step_cost=-0.0200 complete=+0.0000 +------------------------------- +| time/ | | +| fps | 1866 | +| iterations | 46 | +| time_elapsed | 403 | +| total_timesteps | 753664 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1887 | +| iterations | 47 | +| time_elapsed | 407 | +| total_timesteps | 770048 | +| train/ | | +| approx_kl | 0.005094421 | +| clip_fraction | 0.0496 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.917 | +| learning_rate | 0.0003 | +| loss | -0.0237 | +| n_updates | 460 | +| policy_gradient_loss | -0.00332 | +| std | 1.06 | +| value_loss | 0.0275 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1906 | +| iterations | 48 | +| time_elapsed | 412 | +| total_timesteps | 786432 | +| train/ | | +| approx_kl | 0.006302662 | +| clip_fraction | 0.0571 | +| clip_range | 0.2 | +| entropy_loss | -2.94 | +| explained_variance | 0.944 | +| learning_rate | 0.0003 | +| loss | -0.0353 | +| n_updates | 470 | +| policy_gradient_loss | -0.00424 | +| std | 1.05 | +| value_loss | 0.0201 | +----------------------------------------- +Eval num_timesteps=800000, episode_reward=-31.43 +/- 45.97 +Episode length: 1953.35 +/- 203.34 +------------------------------------------ +| eval/ | | +| mean_ep_length | 1.95e+03 | +| mean_reward | -31.4 | +| time/ | | +| total_timesteps | 800000 | +| train/ | | +| approx_kl | 0.0055750986 | +| clip_fraction | 0.0494 | +| clip_range | 0.2 | +| entropy_loss | -2.95 | +| explained_variance | 0.959 | +| learning_rate | 0.0003 | +| loss | -0.0262 | +| n_updates | 480 | +| policy_gradient_loss | -0.00386 | +| std | 1.06 | +| value_loss | 0.0218 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 1878 | +| iterations | 49 | +| time_elapsed | 427 | +| total_timesteps | 802816 | +------------------------------- +------------------------------------------ +| time/ | | +| fps | 1897 | +| iterations | 50 | +| time_elapsed | 431 | +| total_timesteps | 819200 | +| train/ | | +| approx_kl | 0.0057711033 | +| clip_fraction | 0.0568 | +| clip_range | 0.2 | +| entropy_loss | -2.95 | +| explained_variance | 0.838 | +| learning_rate | 0.0003 | +| loss | -0.0362 | +| n_updates | 490 | +| policy_gradient_loss | -0.00438 | +| std | 1.06 | +| value_loss | 0.00952 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1914 | +| iterations | 51 | +| time_elapsed | 436 | +| total_timesteps | 835584 | +| train/ | | +| approx_kl | 0.0073408587 | +| clip_fraction | 0.077 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.931 | +| learning_rate | 0.0003 | +| loss | -0.0283 | +| n_updates | 500 | +| policy_gradient_loss | -0.00553 | +| std | 1.07 | +| value_loss | 0.0142 | +------------------------------------------ +Eval num_timesteps=850000, episode_reward=-37.98 +/- 27.04 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -38 | +| time/ | | +| total_timesteps | 850000 | +| train/ | | +| approx_kl | 0.0055803536 | +| clip_fraction | 0.0536 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.931 | +| learning_rate | 0.0003 | +| loss | -0.0338 | +| n_updates | 510 | +| policy_gradient_loss | -0.00469 | +| std | 1.06 | +| value_loss | 0.0156 | +------------------------------------------ +------------------------------- +| time/ | | +| fps | 1884 | +| iterations | 52 | +| time_elapsed | 452 | +| total_timesteps | 851968 | +------------------------------- +---------------------------------------- +| time/ | | +| fps | 1899 | +| iterations | 53 | +| time_elapsed | 457 | +| total_timesteps | 868352 | +| train/ | | +| approx_kl | 0.00585186 | +| clip_fraction | 0.0638 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.83 | +| learning_rate | 0.0003 | +| loss | -0.0333 | +| n_updates | 520 | +| policy_gradient_loss | -0.00395 | +| std | 1.07 | +| value_loss | 0.0322 | +---------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1915 | +| iterations | 54 | +| time_elapsed | 461 | +| total_timesteps | 884736 | +| train/ | | +| approx_kl | 0.0055105407 | +| clip_fraction | 0.045 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.845 | +| learning_rate | 0.0003 | +| loss | -0.0283 | +| n_updates | 530 | +| policy_gradient_loss | -0.00367 | +| std | 1.06 | +| value_loss | 0.0109 | +------------------------------------------ +Eval num_timesteps=900000, episode_reward=-41.53 +/- 35.40 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -41.5 | +| time/ | | +| total_timesteps | 900000 | +| train/ | | +| approx_kl | 0.0064837057 | +| clip_fraction | 0.0625 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.909 | +| learning_rate | 0.0003 | +| loss | -0.0394 | +| n_updates | 540 | +| policy_gradient_loss | -0.00409 | +| std | 1.06 | +| value_loss | 0.0147 | +------------------------------------------ + +[Diag @ 900,000 | n_sheep=2 | success=0%] + COMPACT_CANT_DRIVE 12/20 + NEVER_COMPACT 8/20 + action_mag mean=0.276 p10=0.038 p90=0.580 (0=stopped, 1=full speed) + min_flock_radius mean=4.30m best=0.98m (target <5m to compact) + min_dog_to_com mean=3.24m best=0.24m (FLEE_DIST=7m) + min_com_to_pen mean=12.15m best=5.60m + reward/step (mean): progress=-0.0048 alignment=+0.0000 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 +------------------------------- +| time/ | | +| fps | 1857 | +| iterations | 55 | +| time_elapsed | 485 | +| total_timesteps | 901120 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1874 | +| iterations | 56 | +| time_elapsed | 489 | +| total_timesteps | 917504 | +| train/ | | +| approx_kl | 0.006582682 | +| clip_fraction | 0.0662 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.961 | +| learning_rate | 0.0003 | +| loss | -0.039 | +| n_updates | 550 | +| policy_gradient_loss | -0.00462 | +| std | 1.07 | +| value_loss | 0.0103 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1888 | +| iterations | 57 | +| time_elapsed | 494 | +| total_timesteps | 933888 | +| train/ | | +| approx_kl | 0.0059698187 | +| clip_fraction | 0.0573 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.907 | +| learning_rate | 0.0003 | +| loss | -0.0291 | +| n_updates | 560 | +| policy_gradient_loss | -0.00446 | +| std | 1.07 | +| value_loss | 0.0113 | +------------------------------------------ +Eval num_timesteps=950000, episode_reward=-26.73 +/- 22.82 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -26.7 | +| time/ | | +| total_timesteps | 950000 | +| train/ | | +| approx_kl | 0.006601461 | +| clip_fraction | 0.0594 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.872 | +| learning_rate | 0.0003 | +| loss | -0.034 | +| n_updates | 570 | +| policy_gradient_loss | -0.00455 | +| std | 1.06 | +| value_loss | 0.00901 | +----------------------------------------- +------------------------------- +| time/ | | +| fps | 1856 | +| iterations | 58 | +| time_elapsed | 511 | +| total_timesteps | 950272 | +------------------------------- +----------------------------------------- +| time/ | | +| fps | 1869 | +| iterations | 59 | +| time_elapsed | 517 | +| total_timesteps | 966656 | +| train/ | | +| approx_kl | 0.005824944 | +| clip_fraction | 0.0624 | +| clip_range | 0.2 | +| entropy_loss | -2.96 | +| explained_variance | 0.789 | +| learning_rate | 0.0003 | +| loss | -0.0214 | +| n_updates | 580 | +| policy_gradient_loss | -0.00363 | +| std | 1.07 | +| value_loss | 0.0359 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1882 | +| iterations | 60 | +| time_elapsed | 522 | +| total_timesteps | 983040 | +| train/ | | +| approx_kl | 0.005888001 | +| clip_fraction | 0.0573 | +| clip_range | 0.2 | +| entropy_loss | -2.98 | +| explained_variance | 0.887 | +| learning_rate | 0.0003 | +| loss | -0.0391 | +| n_updates | 590 | +| policy_gradient_loss | -0.00371 | +| std | 1.07 | +| value_loss | 0.00935 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1895 | +| iterations | 61 | +| time_elapsed | 527 | +| total_timesteps | 999424 | +| train/ | | +| approx_kl | 0.005874036 | +| clip_fraction | 0.0611 | +| clip_range | 0.2 | +| entropy_loss | -2.98 | +| explained_variance | 0.871 | +| learning_rate | 0.0003 | +| loss | -0.0246 | +| n_updates | 600 | +| policy_gradient_loss | -0.00492 | +| std | 1.07 | +| value_loss | 0.00877 | +----------------------------------------- +Eval num_timesteps=1000000, episode_reward=-22.72 +/- 33.15 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -22.7 | +| time/ | | +| total_timesteps | 1000000 | +| train/ | | +| approx_kl | 0.0060388125 | +| clip_fraction | 0.0637 | +| clip_range | 0.2 | +| entropy_loss | -2.97 | +| explained_variance | 0.737 | +| learning_rate | 0.0003 | +| loss | -0.0511 | +| n_updates | 610 | +| policy_gradient_loss | -0.00387 | +| std | 1.07 | +| value_loss | 0.0538 | +------------------------------------------ +New best mean reward! +-------------------------------- +| time/ | | +| fps | 1869 | +| iterations | 62 | +| time_elapsed | 543 | +| total_timesteps | 1015808 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1882 | +| iterations | 63 | +| time_elapsed | 548 | +| total_timesteps | 1032192 | +| train/ | | +| approx_kl | 0.007320485 | +| clip_fraction | 0.0723 | +| clip_range | 0.2 | +| entropy_loss | -2.99 | +| explained_variance | 0.946 | +| learning_rate | 0.0003 | +| loss | -0.0342 | +| n_updates | 620 | +| policy_gradient_loss | -0.0052 | +| std | 1.08 | +| value_loss | 0.0174 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1894 | +| iterations | 64 | +| time_elapsed | 553 | +| total_timesteps | 1048576 | +| train/ | | +| approx_kl | 0.0066477214 | +| clip_fraction | 0.0621 | +| clip_range | 0.2 | +| entropy_loss | -3 | +| explained_variance | 0.919 | +| learning_rate | 0.0003 | +| loss | -0.0301 | +| n_updates | 630 | +| policy_gradient_loss | -0.00449 | +| std | 1.08 | +| value_loss | 0.0109 | +------------------------------------------ +Eval num_timesteps=1050000, episode_reward=-39.86 +/- 28.77 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -39.9 | +| time/ | | +| total_timesteps | 1050000 | +| train/ | | +| approx_kl | 0.0066243596 | +| clip_fraction | 0.0772 | +| clip_range | 0.2 | +| entropy_loss | -2.99 | +| explained_variance | 0.861 | +| learning_rate | 0.0003 | +| loss | -0.0313 | +| n_updates | 640 | +| policy_gradient_loss | -0.00462 | +| std | 1.07 | +| value_loss | 0.0324 | +------------------------------------------ + +[Diag @ 1,050,000 | n_sheep=2 | success=0%] + COMPACT_CANT_DRIVE 18/20 + NEVER_COMPACT 2/20 + action_mag mean=0.200 p10=0.022 p90=0.478 (0=stopped, 1=full speed) + min_flock_radius mean=2.29m best=0.00m (target <5m to compact) + min_dog_to_com mean=3.23m best=0.05m (FLEE_DIST=7m) + min_com_to_pen mean=12.84m best=3.77m + reward/step (mean): progress=+0.0016 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1843 | +| iterations | 65 | +| time_elapsed | 577 | +| total_timesteps | 1064960 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1855 | +| iterations | 66 | +| time_elapsed | 582 | +| total_timesteps | 1081344 | +| train/ | | +| approx_kl | 0.0066154073 | +| clip_fraction | 0.0657 | +| clip_range | 0.2 | +| entropy_loss | -2.99 | +| explained_variance | 0.836 | +| learning_rate | 0.0003 | +| loss | -0.029 | +| n_updates | 650 | +| policy_gradient_loss | -0.0049 | +| std | 1.08 | +| value_loss | 0.0135 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1865 | +| iterations | 67 | +| time_elapsed | 588 | +| total_timesteps | 1097728 | +| train/ | | +| approx_kl | 0.0059733046 | +| clip_fraction | 0.0634 | +| clip_range | 0.2 | +| entropy_loss | -3.01 | +| explained_variance | 0.852 | +| learning_rate | 0.0003 | +| loss | -0.0254 | +| n_updates | 660 | +| policy_gradient_loss | -0.00452 | +| std | 1.09 | +| value_loss | 0.0395 | +------------------------------------------ +Eval num_timesteps=1100000, episode_reward=-33.30 +/- 26.65 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -33.3 | +| time/ | | +| total_timesteps | 1100000 | +| train/ | | +| approx_kl | 0.0054050894 | +| clip_fraction | 0.048 | +| clip_range | 0.2 | +| entropy_loss | -3.02 | +| explained_variance | 0.851 | +| learning_rate | 0.0003 | +| loss | -0.0348 | +| n_updates | 670 | +| policy_gradient_loss | -0.00385 | +| std | 1.1 | +| value_loss | 0.0247 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1843 | +| iterations | 68 | +| time_elapsed | 604 | +| total_timesteps | 1114112 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1856 | +| iterations | 69 | +| time_elapsed | 608 | +| total_timesteps | 1130496 | +| train/ | | +| approx_kl | 0.0073612374 | +| clip_fraction | 0.076 | +| clip_range | 0.2 | +| entropy_loss | -3.01 | +| explained_variance | 0.885 | +| learning_rate | 0.0003 | +| loss | -0.0424 | +| n_updates | 680 | +| policy_gradient_loss | -0.00512 | +| std | 1.09 | +| value_loss | 0.0278 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1869 | +| iterations | 70 | +| time_elapsed | 613 | +| total_timesteps | 1146880 | +| train/ | | +| approx_kl | 0.0063554104 | +| clip_fraction | 0.067 | +| clip_range | 0.2 | +| entropy_loss | -3.01 | +| explained_variance | 0.915 | +| learning_rate | 0.0003 | +| loss | -0.0302 | +| n_updates | 690 | +| policy_gradient_loss | -0.00577 | +| std | 1.09 | +| value_loss | 0.0116 | +------------------------------------------ +Eval num_timesteps=1150000, episode_reward=-26.91 +/- 26.08 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -26.9 | +| time/ | | +| total_timesteps | 1150000 | +| train/ | | +| approx_kl | 0.006060633 | +| clip_fraction | 0.0603 | +| clip_range | 0.2 | +| entropy_loss | -3.02 | +| explained_variance | 0.905 | +| learning_rate | 0.0003 | +| loss | -0.0374 | +| n_updates | 700 | +| policy_gradient_loss | -0.00442 | +| std | 1.1 | +| value_loss | 0.0101 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1847 | +| iterations | 71 | +| time_elapsed | 629 | +| total_timesteps | 1163264 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1859 | +| iterations | 72 | +| time_elapsed | 634 | +| total_timesteps | 1179648 | +| train/ | | +| approx_kl | 0.0070389216 | +| clip_fraction | 0.0728 | +| clip_range | 0.2 | +| entropy_loss | -3.03 | +| explained_variance | 0.854 | +| learning_rate | 0.0003 | +| loss | -0.0409 | +| n_updates | 710 | +| policy_gradient_loss | -0.00505 | +| std | 1.1 | +| value_loss | 0.0196 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1871 | +| iterations | 73 | +| time_elapsed | 638 | +| total_timesteps | 1196032 | +| train/ | | +| approx_kl | 0.0055403598 | +| clip_fraction | 0.0567 | +| clip_range | 0.2 | +| entropy_loss | -3.03 | +| explained_variance | 0.906 | +| learning_rate | 0.0003 | +| loss | -0.0324 | +| n_updates | 720 | +| policy_gradient_loss | -0.00494 | +| std | 1.1 | +| value_loss | 0.0109 | +------------------------------------------ +Eval num_timesteps=1200000, episode_reward=-23.57 +/- 26.30 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -23.6 | +| time/ | | +| total_timesteps | 1200000 | +| train/ | | +| approx_kl | 0.0055604624 | +| clip_fraction | 0.0522 | +| clip_range | 0.2 | +| entropy_loss | -3.02 | +| explained_variance | 0.819 | +| learning_rate | 0.0003 | +| loss | -0.00379 | +| n_updates | 730 | +| policy_gradient_loss | -0.00374 | +| std | 1.1 | +| value_loss | 0.0453 | +------------------------------------------ + +[Diag @ 1,200,000 | n_sheep=2 | success=0%] + COMPACT_CANT_DRIVE 15/20 + NEVER_COMPACT 4/20 + DROVE_NO_SHEEP 1/20 + action_mag mean=0.399 p10=0.067 p90=0.794 (0=stopped, 1=full speed) + min_flock_radius mean=2.96m best=0.00m (target <5m to compact) + min_dog_to_com mean=2.17m best=0.14m (FLEE_DIST=7m) + min_com_to_pen mean=11.07m best=2.66m + reward/step (mean): progress=+0.0064 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 + +[Curriculum] leaving stage n_sheep=2 after 600,000 steps | training success rate (last 100 eps) = 0% +[Curriculum] → 3 sheep at step 1,200,000 + +-------------------------------- +| time/ | | +| fps | 1828 | +| iterations | 74 | +| time_elapsed | 663 | +| total_timesteps | 1212416 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1839 | +| iterations | 75 | +| time_elapsed | 668 | +| total_timesteps | 1228800 | +| train/ | | +| approx_kl | 0.007044647 | +| clip_fraction | 0.0819 | +| clip_range | 0.2 | +| entropy_loss | -3.02 | +| explained_variance | 0.902 | +| learning_rate | 0.0003 | +| loss | -0.00823 | +| n_updates | 740 | +| policy_gradient_loss | -0.00327 | +| std | 1.1 | +| value_loss | 0.042 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1849 | +| iterations | 76 | +| time_elapsed | 673 | +| total_timesteps | 1245184 | +| train/ | | +| approx_kl | 0.0064169513 | +| clip_fraction | 0.0699 | +| clip_range | 0.2 | +| entropy_loss | -3.03 | +| explained_variance | 0.928 | +| learning_rate | 0.0003 | +| loss | -0.0323 | +| n_updates | 750 | +| policy_gradient_loss | -0.00459 | +| std | 1.1 | +| value_loss | 0.0102 | +------------------------------------------ +Eval num_timesteps=1250000, episode_reward=-27.97 +/- 37.55 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -28 | +| time/ | | +| total_timesteps | 1250000 | +| train/ | | +| approx_kl | 0.006859841 | +| clip_fraction | 0.0783 | +| clip_range | 0.2 | +| entropy_loss | -3.04 | +| explained_variance | 0.94 | +| learning_rate | 0.0003 | +| loss | -0.0368 | +| n_updates | 760 | +| policy_gradient_loss | -0.00472 | +| std | 1.11 | +| value_loss | 0.00931 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1825 | +| iterations | 77 | +| time_elapsed | 691 | +| total_timesteps | 1261568 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1836 | +| iterations | 78 | +| time_elapsed | 696 | +| total_timesteps | 1277952 | +| train/ | | +| approx_kl | 0.0066901552 | +| clip_fraction | 0.0704 | +| clip_range | 0.2 | +| entropy_loss | -3.04 | +| explained_variance | 0.942 | +| learning_rate | 0.0003 | +| loss | -0.0329 | +| n_updates | 770 | +| policy_gradient_loss | -0.00458 | +| std | 1.11 | +| value_loss | 0.00938 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 1845 | +| iterations | 79 | +| time_elapsed | 701 | +| total_timesteps | 1294336 | +| train/ | | +| approx_kl | 0.007008245 | +| clip_fraction | 0.082 | +| clip_range | 0.2 | +| entropy_loss | -3.03 | +| explained_variance | 0.899 | +| learning_rate | 0.0003 | +| loss | -0.0194 | +| n_updates | 780 | +| policy_gradient_loss | -0.00426 | +| std | 1.1 | +| value_loss | 0.052 | +----------------------------------------- +Eval num_timesteps=1300000, episode_reward=-41.12 +/- 37.68 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -41.1 | +| time/ | | +| total_timesteps | 1300000 | +| train/ | | +| approx_kl | 0.0070775724 | +| clip_fraction | 0.0742 | +| clip_range | 0.2 | +| entropy_loss | -3.03 | +| explained_variance | 0.942 | +| learning_rate | 0.0003 | +| loss | -0.0238 | +| n_updates | 790 | +| policy_gradient_loss | -0.0052 | +| std | 1.11 | +| value_loss | 0.00657 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1823 | +| iterations | 80 | +| time_elapsed | 718 | +| total_timesteps | 1310720 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1832 | +| iterations | 81 | +| time_elapsed | 724 | +| total_timesteps | 1327104 | +| train/ | | +| approx_kl | 0.008046751 | +| clip_fraction | 0.0851 | +| clip_range | 0.2 | +| entropy_loss | -3.04 | +| explained_variance | 0.897 | +| learning_rate | 0.0003 | +| loss | -0.0384 | +| n_updates | 800 | +| policy_gradient_loss | -0.0057 | +| std | 1.11 | +| value_loss | 0.009 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1840 | +| iterations | 82 | +| time_elapsed | 730 | +| total_timesteps | 1343488 | +| train/ | | +| approx_kl | 0.006007643 | +| clip_fraction | 0.0548 | +| clip_range | 0.2 | +| entropy_loss | -3.06 | +| explained_variance | 0.871 | +| learning_rate | 0.0003 | +| loss | -0.0251 | +| n_updates | 810 | +| policy_gradient_loss | -0.00416 | +| std | 1.12 | +| value_loss | 0.0179 | +----------------------------------------- +Eval num_timesteps=1350000, episode_reward=-24.46 +/- 41.24 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -24.5 | +| time/ | | +| total_timesteps | 1350000 | +| train/ | | +| approx_kl | 0.0065572546 | +| clip_fraction | 0.0698 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.877 | +| learning_rate | 0.0003 | +| loss | -0.0219 | +| n_updates | 820 | +| policy_gradient_loss | -0.00456 | +| std | 1.13 | +| value_loss | 0.0242 | +------------------------------------------ + +[Diag @ 1,350,000 | n_sheep=3 | success=0%] + NEVER_COMPACT 14/20 + COMPACT_CANT_DRIVE 6/20 + action_mag mean=0.195 p10=0.018 p90=0.576 (0=stopped, 1=full speed) + min_flock_radius mean=6.32m best=1.36m (target <5m to compact) + min_dog_to_com mean=4.15m best=0.61m (FLEE_DIST=7m) + min_com_to_pen mean=11.37m best=4.88m + reward/step (mean): progress=+0.0029 alignment=+0.0000 pen_bonus=+0.0000 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1798 | +| iterations | 83 | +| time_elapsed | 756 | +| total_timesteps | 1359872 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1809 | +| iterations | 84 | +| time_elapsed | 760 | +| total_timesteps | 1376256 | +| train/ | | +| approx_kl | 0.0072198315 | +| clip_fraction | 0.0764 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.909 | +| learning_rate | 0.0003 | +| loss | -0.0208 | +| n_updates | 830 | +| policy_gradient_loss | -0.00626 | +| std | 1.13 | +| value_loss | 0.0106 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1817 | +| iterations | 85 | +| time_elapsed | 766 | +| total_timesteps | 1392640 | +| train/ | | +| approx_kl | 0.0070813587 | +| clip_fraction | 0.0733 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.907 | +| learning_rate | 0.0003 | +| loss | -0.0324 | +| n_updates | 840 | +| policy_gradient_loss | -0.00505 | +| std | 1.13 | +| value_loss | 0.0166 | +------------------------------------------ +Eval num_timesteps=1400000, episode_reward=-36.32 +/- 33.15 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -36.3 | +| time/ | | +| total_timesteps | 1400000 | +| train/ | | +| approx_kl | 0.0067584305 | +| clip_fraction | 0.08 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.906 | +| learning_rate | 0.0003 | +| loss | -0.0308 | +| n_updates | 850 | +| policy_gradient_loss | -0.0054 | +| std | 1.13 | +| value_loss | 0.0112 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1798 | +| iterations | 86 | +| time_elapsed | 783 | +| total_timesteps | 1409024 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1807 | +| iterations | 87 | +| time_elapsed | 788 | +| total_timesteps | 1425408 | +| train/ | | +| approx_kl | 0.007411341 | +| clip_fraction | 0.0716 | +| clip_range | 0.2 | +| entropy_loss | -3.09 | +| explained_variance | 0.904 | +| learning_rate | 0.0003 | +| loss | -0.0322 | +| n_updates | 860 | +| policy_gradient_loss | -0.00641 | +| std | 1.14 | +| value_loss | 0.0191 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1815 | +| iterations | 88 | +| time_elapsed | 794 | +| total_timesteps | 1441792 | +| train/ | | +| approx_kl | 0.0077011855 | +| clip_fraction | 0.0774 | +| clip_range | 0.2 | +| entropy_loss | -3.09 | +| explained_variance | 0.914 | +| learning_rate | 0.0003 | +| loss | -0.0316 | +| n_updates | 870 | +| policy_gradient_loss | -0.00545 | +| std | 1.13 | +| value_loss | 0.0148 | +------------------------------------------ +Eval num_timesteps=1450000, episode_reward=-40.58 +/- 38.17 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -40.6 | +| time/ | | +| total_timesteps | 1450000 | +| train/ | | +| approx_kl | 0.007694071 | +| clip_fraction | 0.0816 | +| clip_range | 0.2 | +| entropy_loss | -3.07 | +| explained_variance | 0.937 | +| learning_rate | 0.0003 | +| loss | -0.036 | +| n_updates | 880 | +| policy_gradient_loss | -0.0054 | +| std | 1.12 | +| value_loss | 0.0111 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1796 | +| iterations | 89 | +| time_elapsed | 811 | +| total_timesteps | 1458176 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1805 | +| iterations | 90 | +| time_elapsed | 816 | +| total_timesteps | 1474560 | +| train/ | | +| approx_kl | 0.007034345 | +| clip_fraction | 0.0693 | +| clip_range | 0.2 | +| entropy_loss | -3.07 | +| explained_variance | 0.924 | +| learning_rate | 0.0003 | +| loss | 0.0472 | +| n_updates | 890 | +| policy_gradient_loss | -0.00472 | +| std | 1.13 | +| value_loss | 0.0352 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1815 | +| iterations | 91 | +| time_elapsed | 821 | +| total_timesteps | 1490944 | +| train/ | | +| approx_kl | 0.0078114523 | +| clip_fraction | 0.0917 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.942 | +| learning_rate | 0.0003 | +| loss | -0.0461 | +| n_updates | 900 | +| policy_gradient_loss | -0.00668 | +| std | 1.13 | +| value_loss | 0.00844 | +------------------------------------------ +Eval num_timesteps=1500000, episode_reward=-19.66 +/- 25.98 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -19.7 | +| time/ | | +| total_timesteps | 1500000 | +| train/ | | +| approx_kl | 0.0067999987 | +| clip_fraction | 0.0606 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.893 | +| learning_rate | 0.0003 | +| loss | -0.0283 | +| n_updates | 910 | +| policy_gradient_loss | -0.00385 | +| std | 1.12 | +| value_loss | 0.0409 | +------------------------------------------ +New best mean reward! + +[Diag @ 1,500,000 | n_sheep=3 | success=0%] + COMPACT_CANT_DRIVE 11/20 + NEVER_COMPACT 7/20 + DROVE_NO_SHEEP 2/20 + action_mag mean=0.185 p10=0.015 p90=0.426 (0=stopped, 1=full speed) + min_flock_radius mean=4.43m best=1.38m (target <5m to compact) + min_dog_to_com mean=2.89m best=0.07m (FLEE_DIST=7m) + min_com_to_pen mean=11.88m best=2.23m + reward/step (mean): progress=+0.0008 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1781 | +| iterations | 92 | +| time_elapsed | 846 | +| total_timesteps | 1507328 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1789 | +| iterations | 93 | +| time_elapsed | 851 | +| total_timesteps | 1523712 | +| train/ | | +| approx_kl | 0.0069550863 | +| clip_fraction | 0.0787 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.897 | +| learning_rate | 0.0003 | +| loss | -0.0204 | +| n_updates | 920 | +| policy_gradient_loss | -0.00394 | +| std | 1.13 | +| value_loss | 0.0324 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 1798 | +| iterations | 94 | +| time_elapsed | 856 | +| total_timesteps | 1540096 | +| train/ | | +| approx_kl | 0.006749108 | +| clip_fraction | 0.0787 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.929 | +| learning_rate | 0.0003 | +| loss | -0.0338 | +| n_updates | 930 | +| policy_gradient_loss | -0.00534 | +| std | 1.13 | +| value_loss | 0.00967 | +----------------------------------------- +Eval num_timesteps=1550000, episode_reward=-26.47 +/- 25.94 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -26.5 | +| time/ | | +| total_timesteps | 1550000 | +| train/ | | +| approx_kl | 0.0073381998 | +| clip_fraction | 0.0679 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.919 | +| learning_rate | 0.0003 | +| loss | -0.0259 | +| n_updates | 940 | +| policy_gradient_loss | -0.00554 | +| std | 1.13 | +| value_loss | 0.00999 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1782 | +| iterations | 95 | +| time_elapsed | 873 | +| total_timesteps | 1556480 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1790 | +| iterations | 96 | +| time_elapsed | 878 | +| total_timesteps | 1572864 | +| train/ | | +| approx_kl | 0.0071112993 | +| clip_fraction | 0.0781 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.929 | +| learning_rate | 0.0003 | +| loss | -0.0324 | +| n_updates | 950 | +| policy_gradient_loss | -0.00428 | +| std | 1.13 | +| value_loss | 0.0246 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1798 | +| iterations | 97 | +| time_elapsed | 883 | +| total_timesteps | 1589248 | +| train/ | | +| approx_kl | 0.0077134473 | +| clip_fraction | 0.0784 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.917 | +| learning_rate | 0.0003 | +| loss | -0.0365 | +| n_updates | 960 | +| policy_gradient_loss | -0.00445 | +| std | 1.13 | +| value_loss | 0.0122 | +------------------------------------------ +Eval num_timesteps=1600000, episode_reward=-35.13 +/- 31.01 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -35.1 | +| time/ | | +| total_timesteps | 1600000 | +| train/ | | +| approx_kl | 0.0070123896 | +| clip_fraction | 0.0712 | +| clip_range | 0.2 | +| entropy_loss | -3.07 | +| explained_variance | 0.919 | +| learning_rate | 0.0003 | +| loss | -0.026 | +| n_updates | 970 | +| policy_gradient_loss | -0.00519 | +| std | 1.13 | +| value_loss | 0.0171 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1781 | +| iterations | 98 | +| time_elapsed | 901 | +| total_timesteps | 1605632 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1789 | +| iterations | 99 | +| time_elapsed | 906 | +| total_timesteps | 1622016 | +| train/ | | +| approx_kl | 0.007990176 | +| clip_fraction | 0.0845 | +| clip_range | 0.2 | +| entropy_loss | -3.07 | +| explained_variance | 0.873 | +| learning_rate | 0.0003 | +| loss | -0.04 | +| n_updates | 980 | +| policy_gradient_loss | -0.0045 | +| std | 1.13 | +| value_loss | 0.0153 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1798 | +| iterations | 100 | +| time_elapsed | 911 | +| total_timesteps | 1638400 | +| train/ | | +| approx_kl | 0.006477687 | +| clip_fraction | 0.0593 | +| clip_range | 0.2 | +| entropy_loss | -3.07 | +| explained_variance | 0.946 | +| learning_rate | 0.0003 | +| loss | -0.0396 | +| n_updates | 990 | +| policy_gradient_loss | -0.00442 | +| std | 1.13 | +| value_loss | 0.0107 | +----------------------------------------- +Eval num_timesteps=1650000, episode_reward=-31.86 +/- 47.05 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -31.9 | +| time/ | | +| total_timesteps | 1650000 | +| train/ | | +| approx_kl | 0.006796476 | +| clip_fraction | 0.0672 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.929 | +| learning_rate | 0.0003 | +| loss | -0.0264 | +| n_updates | 1000 | +| policy_gradient_loss | -0.00375 | +| std | 1.13 | +| value_loss | 0.0385 | +----------------------------------------- + +[Diag @ 1,650,000 | n_sheep=3 | success=0%] + NEVER_COMPACT 11/20 + COMPACT_CANT_DRIVE 9/20 + action_mag mean=0.154 p10=0.005 p90=0.398 (0=stopped, 1=full speed) + min_flock_radius mean=5.81m best=0.00m (target <5m to compact) + min_dog_to_com mean=3.22m best=0.52m (FLEE_DIST=7m) + min_com_to_pen mean=13.42m best=7.08m + reward/step (mean): progress=+0.0061 alignment=+0.0000 pen_bonus=+0.0010 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1768 | +| iterations | 101 | +| time_elapsed | 935 | +| total_timesteps | 1654784 | +-------------------------------- +---------------------------------------- +| time/ | | +| fps | 1774 | +| iterations | 102 | +| time_elapsed | 941 | +| total_timesteps | 1671168 | +| train/ | | +| approx_kl | 0.00682881 | +| clip_fraction | 0.0694 | +| clip_range | 0.2 | +| entropy_loss | -3.08 | +| explained_variance | 0.939 | +| learning_rate | 0.0003 | +| loss | -0.0233 | +| n_updates | 1010 | +| policy_gradient_loss | -0.00461 | +| std | 1.13 | +| value_loss | 0.0183 | +---------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1779 | +| iterations | 103 | +| time_elapsed | 948 | +| total_timesteps | 1687552 | +| train/ | | +| approx_kl | 0.0071003223 | +| clip_fraction | 0.0782 | +| clip_range | 0.2 | +| entropy_loss | -3.1 | +| explained_variance | 0.923 | +| learning_rate | 0.0003 | +| loss | -0.0398 | +| n_updates | 1020 | +| policy_gradient_loss | -0.00491 | +| std | 1.15 | +| value_loss | 0.0101 | +------------------------------------------ +Eval num_timesteps=1700000, episode_reward=-32.11 +/- 36.59 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -32.1 | +| time/ | | +| total_timesteps | 1700000 | +| train/ | | +| approx_kl | 0.0064870613 | +| clip_fraction | 0.0624 | +| clip_range | 0.2 | +| entropy_loss | -3.13 | +| explained_variance | 0.909 | +| learning_rate | 0.0003 | +| loss | -0.0365 | +| n_updates | 1030 | +| policy_gradient_loss | -0.00404 | +| std | 1.17 | +| value_loss | 0.00855 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1762 | +| iterations | 104 | +| time_elapsed | 966 | +| total_timesteps | 1703936 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1769 | +| iterations | 105 | +| time_elapsed | 972 | +| total_timesteps | 1720320 | +| train/ | | +| approx_kl | 0.007349294 | +| clip_fraction | 0.0833 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.926 | +| learning_rate | 0.0003 | +| loss | -0.0358 | +| n_updates | 1040 | +| policy_gradient_loss | -0.00514 | +| std | 1.17 | +| value_loss | 0.00848 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1777 | +| iterations | 106 | +| time_elapsed | 976 | +| total_timesteps | 1736704 | +| train/ | | +| approx_kl | 0.0070306472 | +| clip_fraction | 0.0814 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.887 | +| learning_rate | 0.0003 | +| loss | -0.0359 | +| n_updates | 1050 | +| policy_gradient_loss | -0.00489 | +| std | 1.17 | +| value_loss | 0.0134 | +------------------------------------------ +Eval num_timesteps=1750000, episode_reward=-34.24 +/- 43.23 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -34.2 | +| time/ | | +| total_timesteps | 1750000 | +| train/ | | +| approx_kl | 0.008487761 | +| clip_fraction | 0.102 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.962 | +| learning_rate | 0.0003 | +| loss | -0.0369 | +| n_updates | 1060 | +| policy_gradient_loss | -0.0077 | +| std | 1.17 | +| value_loss | 0.00786 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1762 | +| iterations | 107 | +| time_elapsed | 994 | +| total_timesteps | 1753088 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1766 | +| iterations | 108 | +| time_elapsed | 1001 | +| total_timesteps | 1769472 | +| train/ | | +| approx_kl | 0.0074267983 | +| clip_fraction | 0.0742 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.939 | +| learning_rate | 0.0003 | +| loss | -0.0404 | +| n_updates | 1070 | +| policy_gradient_loss | -0.00575 | +| std | 1.18 | +| value_loss | 0.0158 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1772 | +| iterations | 109 | +| time_elapsed | 1007 | +| total_timesteps | 1785856 | +| train/ | | +| approx_kl | 0.0075380025 | +| clip_fraction | 0.074 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.961 | +| learning_rate | 0.0003 | +| loss | -0.034 | +| n_updates | 1080 | +| policy_gradient_loss | -0.00553 | +| std | 1.17 | +| value_loss | 0.00651 | +------------------------------------------ +Eval num_timesteps=1800000, episode_reward=-31.16 +/- 37.32 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -31.2 | +| time/ | | +| total_timesteps | 1800000 | +| train/ | | +| approx_kl | 0.007386248 | +| clip_fraction | 0.0843 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.922 | +| learning_rate | 0.0003 | +| loss | -0.0419 | +| n_updates | 1090 | +| policy_gradient_loss | -0.00596 | +| std | 1.17 | +| value_loss | 0.00858 | +----------------------------------------- + +[Diag @ 1,800,000 | n_sheep=3 | success=0%] + NEVER_COMPACT 17/20 + COMPACT_CANT_DRIVE 3/20 + action_mag mean=0.164 p10=0.007 p90=0.418 (0=stopped, 1=full speed) + min_flock_radius mean=7.52m best=2.00m (target <5m to compact) + min_dog_to_com mean=2.24m best=0.21m (FLEE_DIST=7m) + min_com_to_pen mean=12.87m best=3.90m + reward/step (mean): progress=-0.0007 alignment=+0.0000 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 + +[Curriculum] leaving stage n_sheep=3 after 600,000 steps | training success rate (last 100 eps) = 0% +[Curriculum] → 4 sheep at step 1,800,000 + +-------------------------------- +| time/ | | +| fps | 1743 | +| iterations | 110 | +| time_elapsed | 1033 | +| total_timesteps | 1802240 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1749 | +| iterations | 111 | +| time_elapsed | 1039 | +| total_timesteps | 1818624 | +| train/ | | +| approx_kl | 0.009158293 | +| clip_fraction | 0.0991 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.893 | +| learning_rate | 0.0003 | +| loss | -0.0414 | +| n_updates | 1100 | +| policy_gradient_loss | -0.00701 | +| std | 1.17 | +| value_loss | 0.0237 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1755 | +| iterations | 112 | +| time_elapsed | 1045 | +| total_timesteps | 1835008 | +| train/ | | +| approx_kl | 0.007241189 | +| clip_fraction | 0.0831 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.874 | +| learning_rate | 0.0003 | +| loss | -0.0241 | +| n_updates | 1110 | +| policy_gradient_loss | -0.00634 | +| std | 1.17 | +| value_loss | 0.0226 | +----------------------------------------- +Eval num_timesteps=1850000, episode_reward=-29.45 +/- 31.10 +Episode length: 2000.00 +/- 0.00 +--------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -29.5 | +| time/ | | +| total_timesteps | 1850000 | +| train/ | | +| approx_kl | 0.0078688 | +| clip_fraction | 0.0777 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.895 | +| learning_rate | 0.0003 | +| loss | -0.036 | +| n_updates | 1120 | +| policy_gradient_loss | -0.00602 | +| std | 1.17 | +| value_loss | 0.0128 | +--------------------------------------- +-------------------------------- +| time/ | | +| fps | 1742 | +| iterations | 113 | +| time_elapsed | 1062 | +| total_timesteps | 1851392 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1749 | +| iterations | 114 | +| time_elapsed | 1067 | +| total_timesteps | 1867776 | +| train/ | | +| approx_kl | 0.008158936 | +| clip_fraction | 0.0963 | +| clip_range | 0.2 | +| entropy_loss | -3.14 | +| explained_variance | 0.897 | +| learning_rate | 0.0003 | +| loss | -0.0324 | +| n_updates | 1130 | +| policy_gradient_loss | -0.00854 | +| std | 1.17 | +| value_loss | 0.0144 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1754 | +| iterations | 115 | +| time_elapsed | 1073 | +| total_timesteps | 1884160 | +| train/ | | +| approx_kl | 0.0074978825 | +| clip_fraction | 0.0844 | +| clip_range | 0.2 | +| entropy_loss | -3.14 | +| explained_variance | 0.92 | +| learning_rate | 0.0003 | +| loss | -0.0246 | +| n_updates | 1140 | +| policy_gradient_loss | -0.00578 | +| std | 1.16 | +| value_loss | 0.0134 | +------------------------------------------ +Eval num_timesteps=1900000, episode_reward=-38.21 +/- 31.08 +Episode length: 2000.00 +/- 0.00 +---------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -38.2 | +| time/ | | +| total_timesteps | 1900000 | +| train/ | | +| approx_kl | 0.00678163 | +| clip_fraction | 0.0711 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.892 | +| learning_rate | 0.0003 | +| loss | -0.0345 | +| n_updates | 1150 | +| policy_gradient_loss | -0.00409 | +| std | 1.18 | +| value_loss | 0.0221 | +---------------------------------------- +-------------------------------- +| time/ | | +| fps | 1740 | +| iterations | 116 | +| time_elapsed | 1091 | +| total_timesteps | 1900544 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1746 | +| iterations | 117 | +| time_elapsed | 1097 | +| total_timesteps | 1916928 | +| train/ | | +| approx_kl | 0.006992462 | +| clip_fraction | 0.0731 | +| clip_range | 0.2 | +| entropy_loss | -3.16 | +| explained_variance | 0.895 | +| learning_rate | 0.0003 | +| loss | -0.0243 | +| n_updates | 1160 | +| policy_gradient_loss | -0.00588 | +| std | 1.18 | +| value_loss | 0.0145 | +----------------------------------------- +------------------------------------------ +| time/ | | +| fps | 1750 | +| iterations | 118 | +| time_elapsed | 1104 | +| total_timesteps | 1933312 | +| train/ | | +| approx_kl | 0.0069225584 | +| clip_fraction | 0.068 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.905 | +| learning_rate | 0.0003 | +| loss | -0.0297 | +| n_updates | 1170 | +| policy_gradient_loss | -0.00516 | +| std | 1.17 | +| value_loss | 0.0153 | +------------------------------------------ +----------------------------------------- +| time/ | | +| fps | 1756 | +| iterations | 119 | +| time_elapsed | 1109 | +| total_timesteps | 1949696 | +| train/ | | +| approx_kl | 0.005966103 | +| clip_fraction | 0.059 | +| clip_range | 0.2 | +| entropy_loss | -3.15 | +| explained_variance | 0.896 | +| learning_rate | 0.0003 | +| loss | -0.0337 | +| n_updates | 1180 | +| policy_gradient_loss | -0.00413 | +| std | 1.17 | +| value_loss | 0.0091 | +----------------------------------------- +Eval num_timesteps=1950000, episode_reward=-59.72 +/- 38.15 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -59.7 | +| time/ | | +| total_timesteps | 1950000 | +| train/ | | +| approx_kl | 0.0067311125 | +| clip_fraction | 0.0733 | +| clip_range | 0.2 | +| entropy_loss | -3.16 | +| explained_variance | 0.861 | +| learning_rate | 0.0003 | +| loss | -0.0147 | +| n_updates | 1190 | +| policy_gradient_loss | -0.00459 | +| std | 1.18 | +| value_loss | 0.0083 | +------------------------------------------ + +[Diag @ 1,950,000 | n_sheep=4 | success=0%] + NEVER_COMPACT 14/20 + COMPACT_CANT_DRIVE 6/20 + action_mag mean=0.325 p10=0.025 p90=0.778 (0=stopped, 1=full speed) + min_flock_radius mean=7.27m best=2.17m (target <5m to compact) + min_dog_to_com mean=3.74m best=0.07m (FLEE_DIST=7m) + min_com_to_pen mean=13.01m best=6.24m + reward/step (mean): progress=+0.0026 alignment=+0.0000 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1728 | +| iterations | 120 | +| time_elapsed | 1137 | +| total_timesteps | 1966080 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1734 | +| iterations | 121 | +| time_elapsed | 1143 | +| total_timesteps | 1982464 | +| train/ | | +| approx_kl | 0.0061555626 | +| clip_fraction | 0.0631 | +| clip_range | 0.2 | +| entropy_loss | -3.17 | +| explained_variance | 0.932 | +| learning_rate | 0.0003 | +| loss | -0.0328 | +| n_updates | 1200 | +| policy_gradient_loss | -0.00446 | +| std | 1.19 | +| value_loss | 0.0133 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1739 | +| iterations | 122 | +| time_elapsed | 1149 | +| total_timesteps | 1998848 | +| train/ | | +| approx_kl | 0.0060347347 | +| clip_fraction | 0.057 | +| clip_range | 0.2 | +| entropy_loss | -3.18 | +| explained_variance | 0.841 | +| learning_rate | 0.0003 | +| loss | -0.0352 | +| n_updates | 1210 | +| policy_gradient_loss | -0.00322 | +| std | 1.19 | +| value_loss | 0.0104 | +------------------------------------------ +Eval num_timesteps=2000000, episode_reward=-37.97 +/- 46.26 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -38 | +| time/ | | +| total_timesteps | 2000000 | +| train/ | | +| approx_kl | 0.0063244104 | +| clip_fraction | 0.0675 | +| clip_range | 0.2 | +| entropy_loss | -3.18 | +| explained_variance | 0.865 | +| learning_rate | 0.0003 | +| loss | -0.0217 | +| n_updates | 1220 | +| policy_gradient_loss | -0.00489 | +| std | 1.2 | +| value_loss | 0.0219 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1725 | +| iterations | 123 | +| time_elapsed | 1167 | +| total_timesteps | 2015232 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1730 | +| iterations | 124 | +| time_elapsed | 1173 | +| total_timesteps | 2031616 | +| train/ | | +| approx_kl | 0.007022621 | +| clip_fraction | 0.0816 | +| clip_range | 0.2 | +| entropy_loss | -3.19 | +| explained_variance | 0.949 | +| learning_rate | 0.0003 | +| loss | -0.0248 | +| n_updates | 1230 | +| policy_gradient_loss | -0.0053 | +| std | 1.19 | +| value_loss | 0.00677 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1735 | +| iterations | 125 | +| time_elapsed | 1179 | +| total_timesteps | 2048000 | +| train/ | | +| approx_kl | 0.006686856 | +| clip_fraction | 0.0653 | +| clip_range | 0.2 | +| entropy_loss | -3.18 | +| explained_variance | 0.928 | +| learning_rate | 0.0003 | +| loss | -0.0333 | +| n_updates | 1240 | +| policy_gradient_loss | -0.00445 | +| std | 1.19 | +| value_loss | 0.00651 | +----------------------------------------- +Eval num_timesteps=2050000, episode_reward=-27.67 +/- 36.42 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -27.7 | +| time/ | | +| total_timesteps | 2050000 | +| train/ | | +| approx_kl | 0.006721792 | +| clip_fraction | 0.0675 | +| clip_range | 0.2 | +| entropy_loss | -3.2 | +| explained_variance | 0.921 | +| learning_rate | 0.0003 | +| loss | -0.0278 | +| n_updates | 1250 | +| policy_gradient_loss | -0.00408 | +| std | 1.21 | +| value_loss | 0.00793 | +----------------------------------------- +-------------------------------- +| time/ | | +| fps | 1721 | +| iterations | 126 | +| time_elapsed | 1198 | +| total_timesteps | 2064384 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1726 | +| iterations | 127 | +| time_elapsed | 1205 | +| total_timesteps | 2080768 | +| train/ | | +| approx_kl | 0.006730888 | +| clip_fraction | 0.0617 | +| clip_range | 0.2 | +| entropy_loss | -3.23 | +| explained_variance | 0.911 | +| learning_rate | 0.0003 | +| loss | -0.0276 | +| n_updates | 1260 | +| policy_gradient_loss | -0.00378 | +| std | 1.22 | +| value_loss | 0.00964 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1732 | +| iterations | 128 | +| time_elapsed | 1210 | +| total_timesteps | 2097152 | +| train/ | | +| approx_kl | 0.007725292 | +| clip_fraction | 0.0775 | +| clip_range | 0.2 | +| entropy_loss | -3.23 | +| explained_variance | 0.913 | +| learning_rate | 0.0003 | +| loss | -0.0371 | +| n_updates | 1270 | +| policy_gradient_loss | -0.006 | +| std | 1.22 | +| value_loss | 0.0109 | +----------------------------------------- +Eval num_timesteps=2100000, episode_reward=-40.56 +/- 44.37 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -40.6 | +| time/ | | +| total_timesteps | 2100000 | +| train/ | | +| approx_kl | 0.0067186276 | +| clip_fraction | 0.0644 | +| clip_range | 0.2 | +| entropy_loss | -3.24 | +| explained_variance | 0.845 | +| learning_rate | 0.0003 | +| loss | -0.0357 | +| n_updates | 1280 | +| policy_gradient_loss | -0.00433 | +| std | 1.23 | +| value_loss | 0.0263 | +------------------------------------------ + +[Diag @ 2,100,000 | n_sheep=4 | success=0%] + NEVER_COMPACT 12/20 + COMPACT_CANT_DRIVE 8/20 + action_mag mean=0.384 p10=0.018 p90=0.884 (0=stopped, 1=full speed) + min_flock_radius mean=6.36m best=2.11m (target <5m to compact) + min_dog_to_com mean=2.94m best=0.40m (FLEE_DIST=7m) + min_com_to_pen mean=12.34m best=5.56m + reward/step (mean): progress=-0.0084 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1706 | +| iterations | 129 | +| time_elapsed | 1238 | +| total_timesteps | 2113536 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1712 | +| iterations | 130 | +| time_elapsed | 1243 | +| total_timesteps | 2129920 | +| train/ | | +| approx_kl | 0.006317258 | +| clip_fraction | 0.0623 | +| clip_range | 0.2 | +| entropy_loss | -3.26 | +| explained_variance | 0.912 | +| learning_rate | 0.0003 | +| loss | -0.0419 | +| n_updates | 1290 | +| policy_gradient_loss | -0.00427 | +| std | 1.24 | +| value_loss | 0.00859 | +----------------------------------------- +---------------------------------------- +| time/ | | +| fps | 1716 | +| iterations | 131 | +| time_elapsed | 1250 | +| total_timesteps | 2146304 | +| train/ | | +| approx_kl | 0.00636432 | +| clip_fraction | 0.0698 | +| clip_range | 0.2 | +| entropy_loss | -3.28 | +| explained_variance | 0.851 | +| learning_rate | 0.0003 | +| loss | -0.0266 | +| n_updates | 1300 | +| policy_gradient_loss | -0.00374 | +| std | 1.25 | +| value_loss | 0.0299 | +---------------------------------------- +Eval num_timesteps=2150000, episode_reward=-63.32 +/- 33.74 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -63.3 | +| time/ | | +| total_timesteps | 2150000 | +| train/ | | +| approx_kl | 0.0060345423 | +| clip_fraction | 0.0563 | +| clip_range | 0.2 | +| entropy_loss | -3.27 | +| explained_variance | 0.898 | +| learning_rate | 0.0003 | +| loss | -0.0404 | +| n_updates | 1310 | +| policy_gradient_loss | -0.00356 | +| std | 1.24 | +| value_loss | 0.0205 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1704 | +| iterations | 132 | +| time_elapsed | 1268 | +| total_timesteps | 2162688 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1709 | +| iterations | 133 | +| time_elapsed | 1274 | +| total_timesteps | 2179072 | +| train/ | | +| approx_kl | 0.007027424 | +| clip_fraction | 0.0693 | +| clip_range | 0.2 | +| entropy_loss | -3.25 | +| explained_variance | 0.9 | +| learning_rate | 0.0003 | +| loss | -0.0315 | +| n_updates | 1320 | +| policy_gradient_loss | -0.00521 | +| std | 1.23 | +| value_loss | 0.0194 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1715 | +| iterations | 134 | +| time_elapsed | 1279 | +| total_timesteps | 2195456 | +| train/ | | +| approx_kl | 0.006112649 | +| clip_fraction | 0.0635 | +| clip_range | 0.2 | +| entropy_loss | -3.24 | +| explained_variance | 0.957 | +| learning_rate | 0.0003 | +| loss | -0.0339 | +| n_updates | 1330 | +| policy_gradient_loss | -0.00383 | +| std | 1.23 | +| value_loss | 0.00861 | +----------------------------------------- +Eval num_timesteps=2200000, episode_reward=-31.28 +/- 44.80 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -31.3 | +| time/ | | +| total_timesteps | 2200000 | +| train/ | | +| approx_kl | 0.0070182728 | +| clip_fraction | 0.076 | +| clip_range | 0.2 | +| entropy_loss | -3.26 | +| explained_variance | 0.883 | +| learning_rate | 0.0003 | +| loss | -0.0412 | +| n_updates | 1340 | +| policy_gradient_loss | -0.00534 | +| std | 1.25 | +| value_loss | 0.013 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1704 | +| iterations | 135 | +| time_elapsed | 1297 | +| total_timesteps | 2211840 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1708 | +| iterations | 136 | +| time_elapsed | 1304 | +| total_timesteps | 2228224 | +| train/ | | +| approx_kl | 0.0062820893 | +| clip_fraction | 0.062 | +| clip_range | 0.2 | +| entropy_loss | -3.26 | +| explained_variance | 0.924 | +| learning_rate | 0.0003 | +| loss | -0.0377 | +| n_updates | 1350 | +| policy_gradient_loss | -0.00497 | +| std | 1.24 | +| value_loss | 0.00797 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1713 | +| iterations | 137 | +| time_elapsed | 1310 | +| total_timesteps | 2244608 | +| train/ | | +| approx_kl | 0.0072454046 | +| clip_fraction | 0.0747 | +| clip_range | 0.2 | +| entropy_loss | -3.25 | +| explained_variance | 0.94 | +| learning_rate | 0.0003 | +| loss | -0.0366 | +| n_updates | 1360 | +| policy_gradient_loss | -0.00572 | +| std | 1.23 | +| value_loss | 0.00852 | +------------------------------------------ +Eval num_timesteps=2250000, episode_reward=-36.00 +/- 38.67 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -36 | +| time/ | | +| total_timesteps | 2250000 | +| train/ | | +| approx_kl | 0.005690419 | +| clip_fraction | 0.0546 | +| clip_range | 0.2 | +| entropy_loss | -3.25 | +| explained_variance | 0.957 | +| learning_rate | 0.0003 | +| loss | -0.0376 | +| n_updates | 1370 | +| policy_gradient_loss | -0.00425 | +| std | 1.23 | +| value_loss | 0.00524 | +----------------------------------------- + +[Diag @ 2,250,000 | n_sheep=4 | success=0%] + NEVER_COMPACT 13/20 + COMPACT_CANT_DRIVE 7/20 + action_mag mean=0.416 p10=0.038 p90=0.887 (0=stopped, 1=full speed) + min_flock_radius mean=6.62m best=2.03m (target <5m to compact) + min_dog_to_com mean=3.54m best=0.40m (FLEE_DIST=7m) + min_com_to_pen mean=14.24m best=9.65m + reward/step (mean): progress=-0.0070 alignment=+0.0000 pen_bonus=+0.0005 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1690 | +| iterations | 138 | +| time_elapsed | 1337 | +| total_timesteps | 2260992 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1696 | +| iterations | 139 | +| time_elapsed | 1342 | +| total_timesteps | 2277376 | +| train/ | | +| approx_kl | 0.0072061084 | +| clip_fraction | 0.0728 | +| clip_range | 0.2 | +| entropy_loss | -3.25 | +| explained_variance | 0.954 | +| learning_rate | 0.0003 | +| loss | -0.0312 | +| n_updates | 1380 | +| policy_gradient_loss | -0.00512 | +| std | 1.23 | +| value_loss | 0.006 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1702 | +| iterations | 140 | +| time_elapsed | 1347 | +| total_timesteps | 2293760 | +| train/ | | +| approx_kl | 0.0066916933 | +| clip_fraction | 0.0626 | +| clip_range | 0.2 | +| entropy_loss | -3.24 | +| explained_variance | 0.939 | +| learning_rate | 0.0003 | +| loss | -0.0408 | +| n_updates | 1390 | +| policy_gradient_loss | -0.00463 | +| std | 1.23 | +| value_loss | 0.00827 | +------------------------------------------ +Eval num_timesteps=2300000, episode_reward=-43.65 +/- 42.86 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -43.7 | +| time/ | | +| total_timesteps | 2300000 | +| train/ | | +| approx_kl | 0.0062987795 | +| clip_fraction | 0.0609 | +| clip_range | 0.2 | +| entropy_loss | -3.26 | +| explained_variance | 0.898 | +| learning_rate | 0.0003 | +| loss | -0.0316 | +| n_updates | 1400 | +| policy_gradient_loss | -0.00442 | +| std | 1.25 | +| value_loss | 0.00955 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1691 | +| iterations | 141 | +| time_elapsed | 1365 | +| total_timesteps | 2310144 | +-------------------------------- +----------------------------------------- +| time/ | | +| fps | 1696 | +| iterations | 142 | +| time_elapsed | 1371 | +| total_timesteps | 2326528 | +| train/ | | +| approx_kl | 0.005443076 | +| clip_fraction | 0.054 | +| clip_range | 0.2 | +| entropy_loss | -3.27 | +| explained_variance | 0.877 | +| learning_rate | 0.0003 | +| loss | -0.0296 | +| n_updates | 1410 | +| policy_gradient_loss | -0.00375 | +| std | 1.24 | +| value_loss | 0.00928 | +----------------------------------------- +----------------------------------------- +| time/ | | +| fps | 1701 | +| iterations | 143 | +| time_elapsed | 1376 | +| total_timesteps | 2342912 | +| train/ | | +| approx_kl | 0.004740049 | +| clip_fraction | 0.0456 | +| clip_range | 0.2 | +| entropy_loss | -3.26 | +| explained_variance | 0.922 | +| learning_rate | 0.0003 | +| loss | -0.0318 | +| n_updates | 1420 | +| policy_gradient_loss | -0.00351 | +| std | 1.24 | +| value_loss | 0.0156 | +----------------------------------------- +Eval num_timesteps=2350000, episode_reward=-37.57 +/- 37.78 +Episode length: 2000.00 +/- 0.00 +------------------------------------------ +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -37.6 | +| time/ | | +| total_timesteps | 2350000 | +| train/ | | +| approx_kl | 0.0056120222 | +| clip_fraction | 0.0542 | +| clip_range | 0.2 | +| entropy_loss | -3.27 | +| explained_variance | 0.911 | +| learning_rate | 0.0003 | +| loss | -0.0272 | +| n_updates | 1430 | +| policy_gradient_loss | -0.0035 | +| std | 1.25 | +| value_loss | 0.00811 | +------------------------------------------ +-------------------------------- +| time/ | | +| fps | 1690 | +| iterations | 144 | +| time_elapsed | 1395 | +| total_timesteps | 2359296 | +-------------------------------- +------------------------------------------ +| time/ | | +| fps | 1695 | +| iterations | 145 | +| time_elapsed | 1401 | +| total_timesteps | 2375680 | +| train/ | | +| approx_kl | 0.0064737825 | +| clip_fraction | 0.0697 | +| clip_range | 0.2 | +| entropy_loss | -3.28 | +| explained_variance | 0.93 | +| learning_rate | 0.0003 | +| loss | -0.036 | +| n_updates | 1440 | +| policy_gradient_loss | -0.00403 | +| std | 1.25 | +| value_loss | 0.00488 | +------------------------------------------ +------------------------------------------ +| time/ | | +| fps | 1699 | +| iterations | 146 | +| time_elapsed | 1407 | +| total_timesteps | 2392064 | +| train/ | | +| approx_kl | 0.0050720195 | +| clip_fraction | 0.0466 | +| clip_range | 0.2 | +| entropy_loss | -3.29 | +| explained_variance | 0.902 | +| learning_rate | 0.0003 | +| loss | -0.0374 | +| n_updates | 1450 | +| policy_gradient_loss | -0.00283 | +| std | 1.26 | +| value_loss | 0.00958 | +------------------------------------------ +Eval num_timesteps=2400000, episode_reward=-42.55 +/- 37.89 +Episode length: 2000.00 +/- 0.00 +----------------------------------------- +| eval/ | | +| mean_ep_length | 2e+03 | +| mean_reward | -42.6 | +| time/ | | +| total_timesteps | 2400000 | +| train/ | | +| approx_kl | 0.005990128 | +| clip_fraction | 0.0565 | +| clip_range | 0.2 | +| entropy_loss | -3.31 | +| explained_variance | 0.869 | +| learning_rate | 0.0003 | +| loss | -0.0448 | +| n_updates | 1460 | +| policy_gradient_loss | -0.0051 | +| std | 1.27 | +| value_loss | 0.00854 | +----------------------------------------- + +[Diag @ 2,400,000 | n_sheep=4 | success=0%] + NEVER_COMPACT 15/20 + COMPACT_CANT_DRIVE 5/20 + action_mag mean=0.424 p10=0.025 p90=0.948 (0=stopped, 1=full speed) + min_flock_radius mean=7.66m best=1.63m (target <5m to compact) + min_dog_to_com mean=4.77m best=0.32m (FLEE_DIST=7m) + min_com_to_pen mean=14.47m best=8.96m + reward/step (mean): progress=-0.0008 alignment=+0.0000 pen_bonus=+0.0003 step_cost=-0.0200 complete=+0.0000 +-------------------------------- +| time/ | | +| fps | 1677 | +| iterations | 147 | +| time_elapsed | 1435 | +| total_timesteps | 2408448 | +-------------------------------- + +Training complete. Artefacts saved to runs/ppo_fix_check/ diff --git a/training/runs/ppo_fix_check/best_model/best_model.zip b/training/runs/ppo_fix_check/best_model/best_model.zip new file mode 100644 index 0000000..8533c33 Binary files /dev/null and b/training/runs/ppo_fix_check/best_model/best_model.zip differ diff --git a/training/runs/ppo_fix_check/evaluations.npz b/training/runs/ppo_fix_check/evaluations.npz new file mode 100644 index 0000000..9ae65e5 Binary files /dev/null and b/training/runs/ppo_fix_check/evaluations.npz differ diff --git a/training/runs/ppo_fix_check/final_model.zip b/training/runs/ppo_fix_check/final_model.zip new file mode 100644 index 0000000..7e1248e Binary files /dev/null and b/training/runs/ppo_fix_check/final_model.zip differ diff --git a/training/runs/ppo_fix_check/vecnorm.pkl b/training/runs/ppo_fix_check/vecnorm.pkl new file mode 100644 index 0000000..f51753c Binary files /dev/null and b/training/runs/ppo_fix_check/vecnorm.pkl differ