Dog rewarding adjustment
This commit is contained in:
+48
-14
@@ -52,8 +52,9 @@ class HerdingEnv(gym.Env):
|
|||||||
# -----------------------------------------------------------------------
|
# -----------------------------------------------------------------------
|
||||||
# Reward weights
|
# Reward weights
|
||||||
# -----------------------------------------------------------------------
|
# -----------------------------------------------------------------------
|
||||||
W_APPROACH = 0.3 # dense: dog distance to nearest active sheep
|
W_ALIGN = 0.4 # dense: dog on anti-pen side of each active sheep
|
||||||
W_SHAPING = 0.5 # dense: mean sheep distance to pen (was 0.01)
|
W_SHAPING = 0.5 # dense: mean sheep distance to pen
|
||||||
|
W_APPROACH = 0.1 # dense: dog within flee range of nearest sheep
|
||||||
W_PEN_BONUS = 5.0 # sparse: per sheep successfully penned
|
W_PEN_BONUS = 5.0 # sparse: per sheep successfully penned
|
||||||
W_COMPLETE = 20.0 # bonus when ALL active sheep are penned
|
W_COMPLETE = 20.0 # bonus when ALL active sheep are penned
|
||||||
W_STEP_COST = 0.002 # penalty per step (encourages efficiency)
|
W_STEP_COST = 0.002 # penalty per step (encourages efficiency)
|
||||||
@@ -106,11 +107,8 @@ class HerdingEnv(gym.Env):
|
|||||||
self._step_count = 0
|
self._step_count = 0
|
||||||
self._prev_penned = 0
|
self._prev_penned = 0
|
||||||
|
|
||||||
# Dog: random start in the open field (not near the pen)
|
|
||||||
self.dog_pos = self.np_random.uniform(-8.0, 5.0, size=(2,)).astype(np.float32)
|
|
||||||
|
|
||||||
# Active sheep (0 .. n_sheep-1): random non-pen positions
|
# Active sheep (0 .. n_sheep-1): random non-pen positions
|
||||||
self.sheep_pos[:] = self.PEN_CENTER # default all to pen centre
|
self.sheep_pos[:] = self.PEN_CENTER
|
||||||
self.penned[:] = True
|
self.penned[:] = True
|
||||||
|
|
||||||
placed = 0
|
placed = 0
|
||||||
@@ -121,6 +119,25 @@ class HerdingEnv(gym.Env):
|
|||||||
self.penned[placed] = False
|
self.penned[placed] = False
|
||||||
placed += 1
|
placed += 1
|
||||||
|
|
||||||
|
# Dog: 50 % of the time start already on the anti-pen side of the
|
||||||
|
# first active sheep (within flee range) so early training gets aligned
|
||||||
|
# starts; the other 50 % is fully random to ensure generalisation.
|
||||||
|
if self.np_random.random() < 0.5:
|
||||||
|
# Place dog behind the first active sheep relative to the pen
|
||||||
|
ref = self.sheep_pos[0]
|
||||||
|
away = ref - self.PEN_CENTER # sheep→anti-pen
|
||||||
|
dist = float(np.linalg.norm(away))
|
||||||
|
if dist > 0.1:
|
||||||
|
away = away / dist
|
||||||
|
offset = away * self.np_random.uniform(2.0, self.FLEE_DIST * 0.8)
|
||||||
|
self.dog_pos = np.clip(
|
||||||
|
(ref + offset).astype(np.float32), -self.FIELD, self.FIELD
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.dog_pos = self.np_random.uniform(
|
||||||
|
-self.FIELD * 0.8, self.FIELD * 0.8, size=(2,)
|
||||||
|
).astype(np.float32)
|
||||||
|
|
||||||
# Inactive slots (n_sheep .. MAX_SHEEP-1): already at pen centre, penned=True
|
# Inactive slots (n_sheep .. MAX_SHEEP-1): already at pen centre, penned=True
|
||||||
|
|
||||||
self.wander_ang = self.np_random.uniform(
|
self.wander_ang = self.np_random.uniform(
|
||||||
@@ -237,20 +254,37 @@ class HerdingEnv(gym.Env):
|
|||||||
active_mask = ~self.penned[:self.n_sheep]
|
active_mask = ~self.penned[:self.n_sheep]
|
||||||
if active_mask.any():
|
if active_mask.any():
|
||||||
active_pos = self.sheep_pos[:self.n_sheep][active_mask]
|
active_pos = self.sheep_pos[:self.n_sheep][active_mask]
|
||||||
|
|
||||||
# Sheep-to-pen shaping: encourages moving sheep toward pen
|
|
||||||
dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
|
dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
|
||||||
shaping = -(dists_pen.mean() / (2 * self.FIELD)) # ∈ [-1, 0]
|
|
||||||
|
|
||||||
# Dog-to-nearest-sheep approach: incentivises the dog to stay
|
|
||||||
# within flee range (FLEE_DIST=7m) rather than wandering away
|
|
||||||
dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1)
|
dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1)
|
||||||
approach = -(dists_dog.min() / (2 * self.FIELD)) # ∈ [-1, 0]
|
|
||||||
|
# Sheep-to-pen shaping
|
||||||
|
shaping = -(dists_pen.mean() / (2 * self.FIELD))
|
||||||
|
|
||||||
|
# Approach: dog penalised for being far from nearest sheep
|
||||||
|
approach = -(dists_dog.min() / (2 * self.FIELD))
|
||||||
|
|
||||||
|
# Alignment: reward dog for being on the anti-pen side of each sheep.
|
||||||
|
# When the dog is opposite the pen relative to a sheep, that sheep
|
||||||
|
# flees toward the pen. Score ∈ [-1, 1] per sheep, weighted by
|
||||||
|
# a proximity gate so only nearby dogs count.
|
||||||
|
align_scores = []
|
||||||
|
for s_pos, d_pen, d_dog in zip(active_pos, dists_pen, dists_dog):
|
||||||
|
if d_pen < 0.1 or d_dog < 0.1:
|
||||||
|
continue
|
||||||
|
pen_dir = (self.PEN_CENTER - s_pos) / d_pen # sheep → pen
|
||||||
|
dog_dir = (self.dog_pos - s_pos) / d_dog # sheep → dog
|
||||||
|
# cos(angle): +1 → dog behind sheep, -1 → dog on pen side
|
||||||
|
cosine = -float(np.dot(pen_dir, dog_dir))
|
||||||
|
# gate: 1 at the sheep, fading linearly to 0 at FLEE_DIST and beyond
|
||||||
|
proximity = max(0.0, 1.0 - d_dog / self.FLEE_DIST)
|
||||||
|
align_scores.append(cosine * proximity)
|
||||||
|
alignment = float(np.mean(align_scores)) if align_scores else 0.0
|
||||||
else:
|
else:
|
||||||
shaping = approach = 0.0
|
shaping = approach = alignment = 0.0
|
||||||
|
|
||||||
reward = shaping * self.W_SHAPING
|
reward = shaping * self.W_SHAPING
|
||||||
reward += approach * self.W_APPROACH
|
reward += approach * self.W_APPROACH
|
||||||
|
reward += alignment * self.W_ALIGN
|
||||||
reward += newly_penned * self.W_PEN_BONUS
|
reward += newly_penned * self.W_PEN_BONUS
|
||||||
reward -= self.W_STEP_COST
|
reward -= self.W_STEP_COST
|
||||||
if n_penned == self.n_sheep:
|
if n_penned == self.n_sheep:
|
||||||
|
|||||||
Reference in New Issue
Block a user