Dog rewarding adjustment

This commit is contained in:
Johnny Fernandes
2026-04-23 11:35:15 +01:00
parent 00eaf47d1f
commit f9c5093211
+50 -16
View File
@@ -52,8 +52,9 @@ class HerdingEnv(gym.Env):
# -----------------------------------------------------------------------
# Reward weights
# -----------------------------------------------------------------------
W_APPROACH = 0.3 # dense: dog distance to nearest active sheep
W_SHAPING = 0.5 # dense: mean sheep distance to pen (was 0.01)
W_ALIGN = 0.4 # dense: dog on anti-pen side of each active sheep
W_SHAPING = 0.5 # dense: mean sheep distance to pen
W_APPROACH = 0.1 # dense: dog within flee range of nearest sheep
W_PEN_BONUS = 5.0 # sparse: per sheep successfully penned
W_COMPLETE = 20.0 # bonus when ALL active sheep are penned
W_STEP_COST = 0.002 # penalty per step (encourages efficiency)
@@ -106,11 +107,8 @@ class HerdingEnv(gym.Env):
self._step_count = 0
self._prev_penned = 0
# Dog: random start in the open field (not near the pen)
self.dog_pos = self.np_random.uniform(-8.0, 5.0, size=(2,)).astype(np.float32)
# Active sheep (0 .. n_sheep-1): random non-pen positions
self.sheep_pos[:] = self.PEN_CENTER # default all to pen centre
self.sheep_pos[:] = self.PEN_CENTER
self.penned[:] = True
placed = 0
@@ -121,6 +119,25 @@ class HerdingEnv(gym.Env):
self.penned[placed] = False
placed += 1
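# Dog: 50 % of the time start already on the anti-pen side of the
# first active sheep (sheep_pos[0], not necessarily the nearest one),
# 2.0 to 0.8 * FLEE_DIST away from it before clipping to the field, so
# early training gets aligned starts; the other 50 % is uniformly random
# within ±0.8 * FIELD to ensure generalisation.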
if self.np_random.random() < 0.5:
# Place dog behind the first active sheep relative to the pen
ref = self.sheep_pos[0]
away = ref - self.PEN_CENTER # sheep→anti-pen
dist = float(np.linalg.norm(away))
if dist > 0.1:
away = away / dist
offset = away * self.np_random.uniform(2.0, self.FLEE_DIST * 0.8)
self.dog_pos = np.clip(
(ref + offset).astype(np.float32), -self.FIELD, self.FIELD
)
else:
self.dog_pos = self.np_random.uniform(
-self.FIELD * 0.8, self.FIELD * 0.8, size=(2,)
).astype(np.float32)
# Inactive slots (n_sheep .. MAX_SHEEP-1): already at pen centre, penned=True
self.wander_ang = self.np_random.uniform(
@@ -237,20 +254,37 @@ class HerdingEnv(gym.Env):
active_mask = ~self.penned[:self.n_sheep]
if active_mask.any():
active_pos = self.sheep_pos[:self.n_sheep][active_mask]
dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1)
# Sheep-to-pen shaping: encourages moving sheep toward pen
dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
shaping = -(dists_pen.mean() / (2 * self.FIELD)) # ∈ [-1, 0]
# Sheep-to-pen shaping
shaping = -(dists_pen.mean() / (2 * self.FIELD))
# Dog-to-nearest-sheep approach: incentivises the dog to stay
# within flee range (FLEE_DIST=7m) rather than wandering away
dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1)
approach = -(dists_dog.min() / (2 * self.FIELD)) # ∈ [-1, 0]
# Approach: dog penalised for being far from nearest sheep
approach = -(dists_dog.min() / (2 * self.FIELD))
# Alignment: reward dog for being on the anti-pen side of each sheep.
# When the dog is opposite the pen relative to a sheep, that sheep
# flees toward the pen. Score ∈ [-1, 1] per sheep, weighted by
# a proximity gate so only sheep close to the dog contribute.
align_scores = []
for s_pos, d_pen, d_dog in zip(active_pos, dists_pen, dists_dog):
if d_pen < 0.1 or d_dog < 0.1:
continue
pen_dir = (self.PEN_CENTER - s_pos) / d_pen # sheep → pen
dog_dir = (self.dog_pos - s_pos) / d_dog # sheep → dog
# cos(angle): +1 → dog behind sheep, -1 → dog on pen side
cosine = -float(np.dot(pen_dir, dog_dir))
# gate: 1 at zero distance, decays linearly to 0 at FLEE_DIST; 0 beyond
proximity = max(0.0, 1.0 - d_dog / self.FLEE_DIST)
align_scores.append(cosine * proximity)
alignment = float(np.mean(align_scores)) if align_scores else 0.0
else:
shaping = approach = 0.0
shaping = approach = alignment = 0.0
reward = shaping * self.W_SHAPING
reward += approach * self.W_APPROACH
reward = shaping * self.W_SHAPING
reward += approach * self.W_APPROACH
reward += alignment * self.W_ALIGN
reward += newly_penned * self.W_PEN_BONUS
reward -= self.W_STEP_COST
if n_penned == self.n_sheep: