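Tentative fix for sheep training with a flock of 10: relax stall detection (5 consecutive checks, only after 3M steps) and pass diag_freq without dividing by n_envs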

This commit is contained in:
Johnny Fernandes
2026-04-24 19:03:18 +01:00
parent 3bac24f406
commit 7bb545eab6
+4 -3
View File
@@ -191,11 +191,12 @@ class DiagnosticCallback(BaseCallback):
for m, c in sorted(failure_counts.items(), key=lambda x: -x[1]):
print(f" {m:<26} {c}/{self.n_episodes}")
# Stall detection: same dominant failure at same n_sheep twice in a row
# Stall detection: same dominant failure at same n_sheep 5 checks in a row,
# and only after 3M total steps (give early stages time to warm up).
key = (n_sheep, dominant)
if key == self._prev_dominant and dominant != "SUCCESS":
self._stall_count += 1
if self._stall_count >= 2:
if self._stall_count >= 5 and self.num_timesteps >= 3_000_000:
print(f"\n[Diag] STALL DETECTED — '{dominant}' on {n_sheep} sheep "
f"for {self._stall_count} consecutive checks. "
f"Aborting training early.")
@@ -302,7 +303,7 @@ def main():
verbose=1,
)
diag_cb = DiagnosticCallback(
diag_freq=max(args.diag_freq // args.n_envs, 1),
diag_freq=args.diag_freq,
n_episodes=20,
max_steps=args.max_steps,
)