Behaviour refinement
This commit is contained in:
+35
-7
@@ -35,6 +35,7 @@ class HerdingEnv(gym.Env):
|
|||||||
PEN_X = (10.0, 13.0)
|
PEN_X = (10.0, 13.0)
|
||||||
PEN_Y = (-15.0, -8.0)
|
PEN_Y = (-15.0, -8.0)
|
||||||
PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
|
PEN_CENTER = np.array([11.5, -11.5], dtype=np.float32)
|
||||||
|
PEN_ENTRY = np.array([11.5, -8.0], dtype=np.float32) # north entrance face center
|
||||||
|
|
||||||
# -----------------------------------------------------------------------
|
# -----------------------------------------------------------------------
|
||||||
# Dynamics — calibrated to match Webots robot specs
|
# Dynamics — calibrated to match Webots robot specs
|
||||||
@@ -62,6 +63,11 @@ class HerdingEnv(gym.Env):
|
|||||||
W_COMPACT = 0.0 # reward for flock-radius reduction (off by default)
|
W_COMPACT = 0.0 # reward for flock-radius reduction (off by default)
|
||||||
ALIGN_SHAPE = "standoff" # "standoff" (peaks at IDEAL) | "near" (peaks at 0)
|
ALIGN_SHAPE = "standoff" # "standoff" (peaks at IDEAL) | "near" (peaks at 0)
|
||||||
ALIGN_GATED = True # gate alignment on action magnitude
|
ALIGN_GATED = True # gate alignment on action magnitude
|
||||||
|
ENTRY_AWARE = True # progress reward targets PEN_ENTRY (entrance face), not
|
||||||
|
# PEN_CENTER. Stops the wall-corraling exploit: when a
|
||||||
|
# sheep is shoved south past y=-8 outside the pen x-range,
|
||||||
|
# distance to PEN_ENTRY grows (since target is at y=-8),
|
||||||
|
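# so progress reward goes negative instead of positive. The same flag also selects the target used by the alignment reward and the pen reference exposed in the observation.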
|
||||||
|
|
||||||
# Initial sheep spawn: first sheep placed anywhere; rest within CLUSTER_RADIUS
|
# Initial sheep spawn: first sheep placed anywhere; rest within CLUSTER_RADIUS
|
||||||
# of it. Set to None for legacy uniform-scatter behaviour.
|
# of it. Set to None for legacy uniform-scatter behaviour.
|
||||||
@@ -182,10 +188,11 @@ class HerdingEnv(gym.Env):
|
|||||||
|
|
||||||
# Initialise per-sheep pen-distance sum for progress reward
|
# Initialise per-sheep pen-distance sum for progress reward
|
||||||
active = ~self.penned[:self.n_sheep]
|
active = ~self.penned[:self.n_sheep]
|
||||||
|
target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
|
||||||
if active.any():
|
if active.any():
|
||||||
self._prev_pen_dist_sum = float(
|
self._prev_pen_dist_sum = float(
|
||||||
np.linalg.norm(
|
np.linalg.norm(
|
||||||
self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1
|
self.sheep_pos[:self.n_sheep][active] - target, axis=1
|
||||||
).sum()
|
).sum()
|
||||||
)
|
)
|
||||||
com0 = self.sheep_pos[:self.n_sheep][active].mean(axis=0)
|
com0 = self.sheep_pos[:self.n_sheep][active].mean(axis=0)
|
||||||
@@ -202,10 +209,26 @@ class HerdingEnv(gym.Env):
|
|||||||
self._step_count += 1
|
self._step_count += 1
|
||||||
|
|
||||||
act = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0)
|
act = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0)
|
||||||
self.dog_pos = np.clip(
|
old_dog = self.dog_pos.copy()
|
||||||
|
new_dog = np.clip(
|
||||||
self.dog_pos + act * self.DOG_SPEED * self.DT,
|
self.dog_pos + act * self.DOG_SPEED * self.DT,
|
||||||
-self.FIELD, self.FIELD
|
-self.FIELD, self.FIELD
|
||||||
)
|
)
|
||||||
|
# Pen wall collision — mirrors Webots geometry. West (x=PEN_X[0]) and
|
||||||
|
# east (x=PEN_X[1]) walls block the dog within the pen's y-range.
|
||||||
|
# North face (y=PEN_Y[1]=-8) is open. South is the field edge.
|
||||||
|
px0, px1 = self.PEN_X
|
||||||
|
py0, py1 = self.PEN_Y
|
||||||
|
if py0 < new_dog[1] < py1:
|
||||||
|
if old_dog[0] < px0 <= new_dog[0]:
|
||||||
|
new_dog[0] = px0 - 1e-3
|
||||||
|
elif old_dog[0] > px0 >= new_dog[0]:
|
||||||
|
new_dog[0] = px0 + 1e-3
|
||||||
|
if old_dog[0] > px1 >= new_dog[0]:
|
||||||
|
new_dog[0] = px1 + 1e-3
|
||||||
|
elif old_dog[0] < px1 <= new_dog[0]:
|
||||||
|
new_dog[0] = px1 - 1e-3
|
||||||
|
self.dog_pos = new_dog.astype(np.float32)
|
||||||
|
|
||||||
for i in range(self.n_sheep):
|
for i in range(self.n_sheep):
|
||||||
if self.penned[i]:
|
if self.penned[i]:
|
||||||
@@ -325,14 +348,18 @@ class HerdingEnv(gym.Env):
|
|||||||
# For 1 sheep: far1-COM = far2-COM = far3-COM = [0,0] → cleanly ignorable.
|
# For 1 sheep: far1-COM = far2-COM = far3-COM = [0,0] → cleanly ignorable.
|
||||||
# For 3+ sheep: non-zero vectors tell the dog where each straggler is
|
# For 3+ sheep: non-zero vectors tell the dog where each straggler is
|
||||||
# within the group, without conflicting with weights trained on 1 sheep.
|
# within the group, without conflicting with weights trained on 1 sheep.
|
||||||
|
# Pen reference for the policy. Aligned with the reward target so the
|
||||||
|
# policy isn't forced to learn an implicit offset between what it sees
|
||||||
|
# ("pen is here") and what it's rewarded for ("get sheep close to here").
|
||||||
|
pen_ref = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
|
||||||
return np.array([
|
return np.array([
|
||||||
self.dog_pos[0] / S, self.dog_pos[1] / S,
|
self.dog_pos[0] / S, self.dog_pos[1] / S,
|
||||||
(com[0] - self.dog_pos[0]) / D, (com[1] - self.dog_pos[1]) / D,
|
(com[0] - self.dog_pos[0]) / D, (com[1] - self.dog_pos[1]) / D,
|
||||||
(far1[0] - com[0]) / D, (far1[1] - com[1]) / D,
|
(far1[0] - com[0]) / D, (far1[1] - com[1]) / D,
|
||||||
(far2[0] - com[0]) / D, (far2[1] - com[1]) / D,
|
(far2[0] - com[0]) / D, (far2[1] - com[1]) / D,
|
||||||
(far3[0] - com[0]) / D, (far3[1] - com[1]) / D,
|
(far3[0] - com[0]) / D, (far3[1] - com[1]) / D,
|
||||||
(self.PEN_CENTER[0] - com[0]) / D, (self.PEN_CENTER[1] - com[1]) / D,
|
(pen_ref[0] - com[0]) / D, (pen_ref[1] - com[1]) / D,
|
||||||
(self.PEN_CENTER[0] - far1[0]) / D, (self.PEN_CENTER[1] - far1[1]) / D,
|
(pen_ref[0] - far1[0]) / D, (pen_ref[1] - far1[1]) / D,
|
||||||
radius / D,
|
radius / D,
|
||||||
active_mask.sum() / self.n_sheep,
|
active_mask.sum() / self.n_sheep,
|
||||||
], dtype=np.float32)
|
], dtype=np.float32)
|
||||||
@@ -344,9 +371,10 @@ class HerdingEnv(gym.Env):
|
|||||||
# Naturally rewards keeping the flock together and pushing toward pen:
|
# Naturally rewards keeping the flock together and pushing toward pen:
|
||||||
# dog behind flock → all sheep flee toward pen → all contribute positive reward.
|
# dog behind flock → all sheep flee toward pen → all contribute positive reward.
|
||||||
# Dog from wrong side → sheep scatter away from pen → negative reward.
|
# Dog from wrong side → sheep scatter away from pen → negative reward.
|
||||||
|
target = self.PEN_ENTRY if self.ENTRY_AWARE else self.PEN_CENTER
|
||||||
if active.any():
|
if active.any():
|
||||||
pen_dists = np.linalg.norm(
|
pen_dists = np.linalg.norm(
|
||||||
self.sheep_pos[:self.n_sheep][active] - self.PEN_CENTER, axis=1
|
self.sheep_pos[:self.n_sheep][active] - target, axis=1
|
||||||
)
|
)
|
||||||
cur_sum = float(pen_dists.sum())
|
cur_sum = float(pen_dists.sum())
|
||||||
r_progress = (self._prev_pen_dist_sum - cur_sum) * self.W_PER_SHEEP
|
r_progress = (self._prev_pen_dist_sum - cur_sum) * self.W_PER_SHEEP
|
||||||
@@ -355,10 +383,10 @@ class HerdingEnv(gym.Env):
|
|||||||
r_progress = 0.0
|
r_progress = 0.0
|
||||||
|
|
||||||
com, _, _ = self._flock_stats()
|
com, _, _ = self._flock_stats()
|
||||||
com_dist = float(np.linalg.norm(com - self.PEN_CENTER))
|
com_dist = float(np.linalg.norm(com - target))
|
||||||
d_dog_com = float(np.linalg.norm(self.dog_pos - com))
|
d_dog_com = float(np.linalg.norm(self.dog_pos - com))
|
||||||
if d_dog_com > 0.1 and com_dist > 0.1:
|
if d_dog_com > 0.1 and com_dist > 0.1:
|
||||||
pen_dir = (self.PEN_CENTER - com) / com_dist
|
pen_dir = (target - com) / com_dist
|
||||||
dog_dir = (self.dog_pos - com) / d_dog_com
|
dog_dir = (self.dog_pos - com) / d_dog_com
|
||||||
cosine = -float(np.dot(pen_dir, dog_dir))
|
cosine = -float(np.dot(pen_dir, dog_dir))
|
||||||
if self.ALIGN_SHAPE == "standoff":
|
if self.ALIGN_SHAPE == "standoff":
|
||||||
|
|||||||
@@ -43,6 +43,10 @@ def main():
|
|||||||
p.add_argument("--mixed", action="store_true",
|
p.add_argument("--mixed", action="store_true",
|
||||||
help="Train with n_sheep randomized per episode (no curriculum). "
|
help="Train with n_sheep randomized per episode (no curriculum). "
|
||||||
"Total train steps = steps-per-stage * max_sheep.")
|
"Total train steps = steps-per-stage * max_sheep.")
|
||||||
|
p.add_argument("--final-mixed-steps", type=int, default=0,
|
||||||
|
help="After the curriculum, train this many extra steps with "
|
||||||
|
"random_n_sheep ∈ [1, max_sheep] to consolidate the policy "
|
||||||
|
"across all flock sizes. Re-evaluates all n_sheep at the end.")
|
||||||
p.add_argument("--n-envs", type=int, default=8)
|
p.add_argument("--n-envs", type=int, default=8)
|
||||||
p.add_argument("--max-steps", type=int, default=2500)
|
p.add_argument("--max-steps", type=int, default=2500)
|
||||||
p.add_argument("--eval-episodes", type=int, default=30)
|
p.add_argument("--eval-episodes", type=int, default=30)
|
||||||
@@ -123,6 +127,28 @@ def main():
|
|||||||
f"mean_act={r['mean_act']:.2f}")
|
f"mean_act={r['mean_act']:.2f}")
|
||||||
stage_results.append({"n_sheep": n, **r})
|
stage_results.append({"n_sheep": n, **r})
|
||||||
|
|
||||||
|
# Optional consolidation pass with mixed n_sheep — fixes specialization
|
||||||
|
# imbalance from curriculum order (e.g. n=1 weakness after long n=10
|
||||||
|
# training). Replaces stage_results with the post-consolidation eval.
|
||||||
|
if args.final_mixed_steps > 0 and not args.mixed:
|
||||||
|
print(f"\n[Consolidation] mixed n_sheep ∈ [1, {args.max_sheep}], "
|
||||||
|
f"{args.final_mixed_steps:,} steps")
|
||||||
|
vn.env_method("__setattr__", "random_n_sheep", True)
|
||||||
|
model.learn(
|
||||||
|
total_timesteps=args.final_mixed_steps,
|
||||||
|
reset_num_timesteps=False,
|
||||||
|
callback=ProgressCallback(0, "consolidate", freq=100_000),
|
||||||
|
)
|
||||||
|
print("[Consolidation] re-evaluating all sheep counts")
|
||||||
|
stage_results = []
|
||||||
|
for n in range(1, args.max_sheep + 1):
|
||||||
|
r = evaluate(model, vn, n, args.eval_episodes, args.max_steps, rcfg)
|
||||||
|
print(f"[Consolidation] n_sheep={n} sr={r['sr']*100:.0f}% "
|
||||||
|
f"mean_len={r['mean_len']:.0f} "
|
||||||
|
f"mean_min_pen={r['mean_min_pen']:.1f}m "
|
||||||
|
f"mean_act={r['mean_act']:.2f}")
|
||||||
|
stage_results.append({"n_sheep": n, **r})
|
||||||
|
|
||||||
model.save(os.path.join(run_dir, "final_model"))
|
model.save(os.path.join(run_dir, "final_model"))
|
||||||
vn.save(os.path.join(run_dir, "vecnorm.pkl"))
|
vn.save(os.path.join(run_dir, "vecnorm.pkl"))
|
||||||
with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
|
with open(os.path.join(run_dir, "stage_results.json"), "w") as f:
|
||||||
|
|||||||
Reference in New Issue
Block a user