Approach v3 w/ south penalty

This commit is contained in:
Johnny Fernandes
2026-04-26 14:55:13 +01:00
parent a561f8a697
commit 11e13c6980
19 changed files with 6549 additions and 3 deletions

Before

Width:  |  Height:  |  Size: 146 KiB

After

Width:  |  Height:  |  Size: 146 KiB

Before

Width:  |  Height:  |  Size: 233 KiB

After

Width:  |  Height:  |  Size: 233 KiB

Before

Width:  |  Height:  |  Size: 74 KiB

After

Width:  |  Height:  |  Size: 74 KiB

Before

Width:  |  Height:  |  Size: 194 KiB

After

Width:  |  Height:  |  Size: 194 KiB

Before

Width:  |  Height:  |  Size: 233 KiB

After

Width:  |  Height:  |  Size: 233 KiB

Before

Width:  |  Height:  |  Size: 72 KiB

After

Width:  |  Height:  |  Size: 72 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 158 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 220 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

Can't render this file because it is too large.
Can't render this file because it is too large.
File diff suppressed because it is too large Load Diff
Binary file not shown.
+15 -2
View File
@@ -59,6 +59,13 @@ VECNORM_PATH = os.path.join(_HERE, "vecnorm.pkl")
DEBUG_CSV = os.path.join(_HERE, "debug.csv") DEBUG_CSV = os.path.join(_HERE, "debug.csv")
DEBUG_ENABLED = True # set False to disable debug.csv logging DEBUG_ENABLED = True # set False to disable debug.csv logging
# ── action smoothing ─────────────────────────────────────────────────────────
# EMA on policy output to suppress the rapid oscillation (vx/vy flipping
# between -1 and +1 every step) that stalls the physical dog. 0 = no
# smoothing (raw policy), 1 = frozen. 0.3 keeps ~30% of previous action.
ACTION_SMOOTH = 0.3
prev_action = np.zeros(2, dtype=np.float32)
def norm_angle(a: float) -> float: def norm_angle(a: float) -> float:
while a > math.pi: a -= 2 * math.pi while a > math.pi: a -= 2 * math.pi
@@ -227,9 +234,15 @@ while robot.step(timestep) != -1:
raw_obs = build_obs(dog_pos, sheep_positions, n_sheep) raw_obs = build_obs(dog_pos, sheep_positions, n_sheep)
obs_norm = vecnorm.normalize_obs(raw_obs[np.newaxis]) # (1, 13) obs_norm = vecnorm.normalize_obs(raw_obs[np.newaxis]) # (1, 13)
# 4. Policy inference # 4. Policy inference + smoothing
action, _ = model.predict(obs_norm, deterministic=True) action, _ = model.predict(obs_norm, deterministic=True)
vx, vy = float(action[0][0]), float(action[0][1]) raw_a = np.array([float(action[0][0]), float(action[0][1])], dtype=np.float32)
if ACTION_SMOOTH > 0:
smoothed = ACTION_SMOOTH * prev_action + (1.0 - ACTION_SMOOTH) * raw_a
prev_action[:] = smoothed
vx, vy = float(smoothed[0]), float(smoothed[1])
else:
vx, vy = float(raw_a[0]), float(raw_a[1])
# 5. Drive # 5. Drive
drive(vx, vy) drive(vx, vy)
Binary file not shown.
+17 -1
View File
@@ -60,6 +60,11 @@ class HerdingEnv(gym.Env):
W_PEN_BONUS = 10.0 # per sheep penned W_PEN_BONUS = 10.0 # per sheep penned
W_COMPLETE = 100.0 # all sheep penned W_COMPLETE = 100.0 # all sheep penned
W_STEP_COST = 0.02 # time penalty — strong enough to punish doing nothing W_STEP_COST = 0.02 # time penalty — strong enough to punish doing nothing
W_SOUTH = 0.01 # per-sheep per-metre penalty for active sheep below the pen
# entrance (y < PEN_Y[1]=-8). Keeps the dog from letting
# sheep drift into the dead zone below the open face where
# they must reverse direction (north) to enter — hard to
# recover. 0.01 ≈ half step_cost per metre below per sheep.
W_COMPACT = 0.0 # reward for flock-radius reduction (off by default) W_COMPACT = 0.0 # reward for flock-radius reduction (off by default)
W_WALL_TOUCH = 0.01 # per-sheep max penalty at wall surface. Linear ramp W_WALL_TOUCH = 0.01 # per-sheep max penalty at wall surface. Linear ramp
# within WALL_TOUCH_BUFFER. Covers field outer walls and # within WALL_TOUCH_BUFFER. Covers field outer walls and
@@ -437,6 +442,16 @@ class HerdingEnv(gym.Env):
else: else:
r_wall_touch = 0.0 r_wall_touch = 0.0
# South penalty: discourage active sheep from drifting below the pen
# entrance (y < PEN_Y[1]). Sheep in this zone must reverse direction
# (move north) to enter — very hard for the dog to recover from.
if self.W_SOUTH and active.any():
pts = self.sheep_pos[:self.n_sheep][active]
depth = np.maximum(0.0, self.PEN_Y[1] - pts[:, 1]) # metres below entrance
r_south = -float(depth.sum()) * self.W_SOUTH
else:
r_south = 0.0
# Compactness shaping: reward decreases in flock radius (active sheep only) # Compactness shaping: reward decreases in flock radius (active sheep only)
if self.W_COMPACT and active.any(): if self.W_COMPACT and active.any():
cur_radius = float(np.linalg.norm( cur_radius = float(np.linalg.norm(
@@ -450,11 +465,12 @@ class HerdingEnv(gym.Env):
r_pen_bonus = newly_penned * self.W_PEN_BONUS r_pen_bonus = newly_penned * self.W_PEN_BONUS
r_step_cost = -self.W_STEP_COST r_step_cost = -self.W_STEP_COST
r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0 r_complete = self.W_COMPLETE if n_penned == self.n_sheep else 0.0
reward = (r_progress + alignment + r_compact + r_wall_touch reward = (r_progress + alignment + r_south + r_compact + r_wall_touch
+ r_pen_bonus + r_step_cost + r_complete) + r_pen_bonus + r_step_cost + r_complete)
rcomps = { rcomps = {
"progress": float(r_progress), "progress": float(r_progress),
"alignment": float(alignment), "alignment": float(alignment),
"south": float(r_south),
"compact": float(r_compact), "compact": float(r_compact),
"wall_touch": float(r_wall_touch), "wall_touch": float(r_wall_touch),
"pen_bonus": float(r_pen_bonus), "pen_bonus": float(r_pen_bonus),
+1
View File
@@ -200,6 +200,7 @@ DEFAULT_CONFIG = {
"W_PEN_BONUS": 10.0, "W_PEN_BONUS": 10.0,
"W_COMPLETE": 100.0, "W_COMPLETE": 100.0,
"W_STEP_COST": 0.02, "W_STEP_COST": 0.02,
"W_SOUTH": 0.01,
"W_COMPACT": 0.0, "W_COMPACT": 0.0,
"W_WALL_TOUCH": 0.04, "W_WALL_TOUCH": 0.04,
"WALL_TOUCH_BUFFER": 0.3, "WALL_TOUCH_BUFFER": 0.3,