Checkpoint 5 - incomplete

This commit is contained in:
Johnny Fernandes
2026-05-11 10:35:39 +01:00
parent 6688325d89
commit b457155538
13 changed files with 174 additions and 74 deletions
+9
View File
@@ -204,6 +204,12 @@ class HerdingEnv(gym.Env):
already mimics a stronger teacher (sequential)."""
self.W_IMITATE = float(value)
def set_time_weight(self, value: float) -> None:
    """Set the instance-level ``W_TIME`` reward weight.

    The weight defaults to 0.0. Passing a small negative number
    (e.g. -0.1) turns it into a per-step penalty, which explicitly
    rewards a fast time-to-pen during PPO fine-tune.

    Args:
        value: New weight; coerced with ``float()`` before it is stored.
    """
    # Coerce first so the attribute always holds a plain float.
    time_weight = float(value)
    self.W_TIME = time_weight
# ---- gym API ----
def reset(self, *, seed=None, options=None):
super().reset(seed=seed)
@@ -431,6 +437,9 @@ class HerdingEnv(gym.Env):
d_progress = max(-5.0, min(5.0, self.prev_d_pen - d_pen))
r = self.W_PEN_DELTA * delta_pen + self.W_PROGRESS * d_progress
# Per-step time penalty (0 by default). When negative, encourages
# the policy to finish quickly — used during PPO fine-tune.
r += self.W_TIME
if action is not None and self.W_IMITATE > 0.0:
positions = self._perceived_positions()