Checkpoint 5 - incomplete
This commit is contained in:
@@ -204,6 +204,12 @@ class HerdingEnv(gym.Env):
|
||||
already mimics a stronger teacher (sequential)."""
|
||||
self.W_IMITATE = float(value)
|
||||
|
||||
def set_time_weight(self, value: float) -> None:
    """Override the per-step time weight ``W_TIME`` on this instance.

    ``W_TIME`` is added to the reward on every step. It defaults to 0.0
    (no effect); a small negative value (e.g. -0.1) charges a constant
    cost per step, so slower episodes accumulate a larger penalty and
    the policy is pushed toward a fast time-to-pen. Intended for use
    during PPO fine-tuning.

    Args:
        value: New weight; anything ``float()`` accepts.

    Raises:
        ValueError: If ``value`` converts to NaN or +/-infinity. Such a
            weight would be added to every step reward and make it
            non-finite.
    """
    # Local import: the file's import block is not visible from this hunk.
    import math

    weight = float(value)
    if not math.isfinite(weight):
        raise ValueError(f"W_TIME must be a finite number, got {value!r}")
    self.W_TIME = weight
|
||||
|
||||
# ---- gym API ----
|
||||
def reset(self, *, seed=None, options=None):
|
||||
super().reset(seed=seed)
|
||||
@@ -431,6 +437,9 @@ class HerdingEnv(gym.Env):
|
||||
|
||||
d_progress = max(-5.0, min(5.0, self.prev_d_pen - d_pen))
|
||||
r = self.W_PEN_DELTA * delta_pen + self.W_PROGRESS * d_progress
|
||||
# Per-step time penalty (0 by default). When negative, encourages
|
||||
# the policy to finish quickly — used during PPO fine-tune.
|
||||
r += self.W_TIME
|
||||
|
||||
if action is not None and self.W_IMITATE > 0.0:
|
||||
positions = self._perceived_positions()
|
||||
|
||||
Reference in New Issue
Block a user