Approach v3 w/ south penalty

2026-04-26 14:55:13 +01:00
parent a561f8a697
commit 11e13c6980
19 changed files with 6549 additions and 3 deletions
@@ -59,6 +59,13 @@ VECNORM_PATH = os.path.join(_HERE, "vecnorm.pkl")
 DEBUG_CSV    = os.path.join(_HERE, "debug.csv")
 DEBUG_ENABLED = True   # set False to disable debug.csv logging
 # ── action smoothing ─────────────────────────────────────────────────────────
 # EMA on policy output to suppress the rapid oscillation (vx/vy flipping
 # between -1 and +1 every step) that stalls the physical dog.  0 = no
 # smoothing (raw policy), 1 = frozen.  0.3 = 30% previous output, 70% new.
 ACTION_SMOOTH = 0.3
 prev_action   = np.zeros(2, dtype=np.float32)
 def norm_angle(a: float) -> float:
    while a >  math.pi: a -= 2 * math.pi
@@ -227,9 +234,15 @@ while robot.step(timestep) != -1:
    raw_obs  = build_obs(dog_pos, sheep_positions, n_sheep)
    obs_norm = vecnorm.normalize_obs(raw_obs[np.newaxis])  # (1, 13)
-    # 4. Policy inference
+    # 4. Policy inference + smoothing
    action, _ = model.predict(obs_norm, deterministic=True)
-    vx, vy    = float(action[0][0]), float(action[0][1])
+    raw_a = np.array([float(action[0][0]), float(action[0][1])], dtype=np.float32)
    if ACTION_SMOOTH > 0:
        smoothed = ACTION_SMOOTH * prev_action + (1.0 - ACTION_SMOOTH) * raw_a
        prev_action[:] = smoothed
        vx, vy = float(smoothed[0]), float(smoothed[1])
    else:
        vx, vy = float(raw_a[0]), float(raw_a[1])
    # 5. Drive
    drive(vx, vy)
@@ -60,6 +60,11 @@ class HerdingEnv(gym.Env):
    W_PEN_BONUS = 10.0   # per sheep penned
    W_COMPLETE  = 100.0  # all sheep penned
    W_STEP_COST = 0.02   # time penalty — strong enough to punish doing nothing
    W_SOUTH     = 0.01   # per-sheep per-metre penalty for active sheep below the pen
                         # entrance (y < PEN_Y[1]=-8). Keeps the dog from letting
                         # sheep drift into the dead zone below the open face where
                         # they must reverse direction (north) to enter — hard to
                         # recover. 0.01 = half of W_STEP_COST, per sheep per metre below.
    W_COMPACT   = 0.0    # reward for flock-radius reduction (off by default)
    W_WALL_TOUCH = 0.01  # per-sheep max penalty at wall surface. Linear ramp
                         # within WALL_TOUCH_BUFFER. Covers field outer walls and
@@ -437,6 +442,16 @@ class HerdingEnv(gym.Env):
        else:
            r_wall_touch = 0.0
        # South penalty: discourage active sheep from drifting below the pen
        # entrance (y < PEN_Y[1]). Sheep in this zone must reverse direction
        # (move north) to enter — very hard for the dog to recover from.
        if self.W_SOUTH and active.any():
            pts = self.sheep_pos[:self.n_sheep][active]
            depth = np.maximum(0.0, self.PEN_Y[1] - pts[:, 1])  # metres below entrance
            r_south = -float(depth.sum()) * self.W_SOUTH
        else:
            r_south = 0.0
        # Compactness shaping: reward the reduction in flock radius (active sheep only)
        if self.W_COMPACT and active.any():
            cur_radius = float(np.linalg.norm(
@@ -450,11 +465,12 @@ class HerdingEnv(gym.Env):
        r_pen_bonus  = newly_penned * self.W_PEN_BONUS
        r_step_cost  = -self.W_STEP_COST
        r_complete   = self.W_COMPLETE if n_penned == self.n_sheep else 0.0
-        reward = (r_progress + alignment + r_compact + r_wall_touch
+        reward = (r_progress + alignment + r_south + r_compact + r_wall_touch
                  + r_pen_bonus + r_step_cost + r_complete)
        rcomps = {
            "progress":   float(r_progress),
            "alignment":  float(alignment),
            "south":      float(r_south),
            "compact":    float(r_compact),
            "wall_touch": float(r_wall_touch),
            "pen_bonus":  float(r_pen_bonus),
@@ -200,6 +200,7 @@ DEFAULT_CONFIG = {
    "W_PEN_BONUS": 10.0,
    "W_COMPLETE": 100.0,
    "W_STEP_COST": 0.02,
    "W_SOUTH": 0.01,
    "W_COMPACT": 0.0,
    "W_WALL_TOUCH": 0.04,
    "WALL_TOUCH_BUFFER": 0.3,