Dog rewarding adjustment
This commit is contained in:
+48
-14
@@ -52,8 +52,9 @@ class HerdingEnv(gym.Env):
|
||||
# -----------------------------------------------------------------------
|
||||
# Reward weights
|
||||
# -----------------------------------------------------------------------
|
||||
W_APPROACH = 0.3 # dense: dog distance to nearest active sheep
|
||||
W_SHAPING = 0.5 # dense: mean sheep distance to pen (was 0.01)
|
||||
W_ALIGN = 0.4 # dense: dog on anti-pen side of each active sheep
|
||||
W_SHAPING = 0.5 # dense: mean sheep distance to pen
|
||||
W_APPROACH = 0.1 # dense: dog distance to nearest active sheep (penalised when far)
|
||||
W_PEN_BONUS = 5.0 # sparse: per sheep successfully penned
|
||||
W_COMPLETE = 20.0 # bonus when ALL active sheep are penned
|
||||
W_STEP_COST = 0.002 # penalty per step (encourages efficiency)
|
||||
@@ -106,11 +107,8 @@ class HerdingEnv(gym.Env):
|
||||
self._step_count = 0
|
||||
self._prev_penned = 0
|
||||
|
||||
# Dog: random start in the open field (not near the pen)
|
||||
self.dog_pos = self.np_random.uniform(-8.0, 5.0, size=(2,)).astype(np.float32)
|
||||
|
||||
# Active sheep (0 .. n_sheep-1): random non-pen positions
|
||||
self.sheep_pos[:] = self.PEN_CENTER # default all to pen centre
|
||||
self.sheep_pos[:] = self.PEN_CENTER
|
||||
self.penned[:] = True
|
||||
|
||||
placed = 0
|
||||
@@ -121,6 +119,25 @@ class HerdingEnv(gym.Env):
|
||||
self.penned[placed] = False
|
||||
placed += 1
|
||||
|
||||
# Dog: 50 % of the time start already on the anti-pen side of the
|
||||
# first active sheep (within flee range) so early training gets aligned
|
||||
# starts; the other 50 % is fully random to ensure generalisation.
|
||||
if self.np_random.random() < 0.5:
|
||||
# Place dog behind the first active sheep relative to the pen
|
||||
ref = self.sheep_pos[0]
|
||||
away = ref - self.PEN_CENTER # sheep→anti-pen
|
||||
dist = float(np.linalg.norm(away))
|
||||
if dist > 0.1:
|
||||
away = away / dist
|
||||
offset = away * self.np_random.uniform(2.0, self.FLEE_DIST * 0.8)
|
||||
self.dog_pos = np.clip(
|
||||
(ref + offset).astype(np.float32), -self.FIELD, self.FIELD
|
||||
)
|
||||
else:
|
||||
self.dog_pos = self.np_random.uniform(
|
||||
-self.FIELD * 0.8, self.FIELD * 0.8, size=(2,)
|
||||
).astype(np.float32)
|
||||
|
||||
# Inactive slots (n_sheep .. MAX_SHEEP-1): already at pen centre, penned=True
|
||||
|
||||
self.wander_ang = self.np_random.uniform(
|
||||
@@ -237,20 +254,37 @@ class HerdingEnv(gym.Env):
|
||||
active_mask = ~self.penned[:self.n_sheep]
|
||||
if active_mask.any():
|
||||
active_pos = self.sheep_pos[:self.n_sheep][active_mask]
|
||||
|
||||
# Sheep-to-pen shaping: encourages moving sheep toward pen
|
||||
dists_pen = np.linalg.norm(active_pos - self.PEN_CENTER, axis=1)
|
||||
shaping = -(dists_pen.mean() / (2 * self.FIELD)) # ∈ [-1, 0]
|
||||
|
||||
# Dog-to-nearest-sheep approach: incentivises the dog to stay
|
||||
# within flee range (FLEE_DIST=7m) rather than wandering away
|
||||
dists_dog = np.linalg.norm(active_pos - self.dog_pos, axis=1)
|
||||
approach = -(dists_dog.min() / (2 * self.FIELD)) # ∈ [-1, 0]
|
||||
|
||||
# Sheep-to-pen shaping
|
||||
shaping = -(dists_pen.mean() / (2 * self.FIELD))
|
||||
|
||||
# Approach: dog penalised for being far from nearest sheep
|
||||
approach = -(dists_dog.min() / (2 * self.FIELD))
|
||||
|
||||
# Alignment: reward dog for being on the anti-pen side of each sheep.
|
||||
# When the dog is opposite the pen relative to a sheep, that sheep
|
||||
# flees toward the pen. Score ∈ [-1, 1] per sheep, weighted by
|
||||
# a proximity gate so the dog only counts when it is near that sheep.
|
||||
align_scores = []
|
||||
for s_pos, d_pen, d_dog in zip(active_pos, dists_pen, dists_dog):
|
||||
if d_pen < 0.1 or d_dog < 0.1:
|
||||
continue
|
||||
pen_dir = (self.PEN_CENTER - s_pos) / d_pen # sheep → pen
|
||||
dog_dir = (self.dog_pos - s_pos) / d_dog # sheep → dog
|
||||
# cos(angle): +1 → dog behind sheep, -1 → dog on pen side
|
||||
cosine = -float(np.dot(pen_dir, dog_dir))
|
||||
# gate: 1 at the sheep, fading linearly to 0 at flee range; zero beyond
|
||||
proximity = max(0.0, 1.0 - d_dog / self.FLEE_DIST)
|
||||
align_scores.append(cosine * proximity)
|
||||
alignment = float(np.mean(align_scores)) if align_scores else 0.0
|
||||
else:
|
||||
shaping = approach = 0.0
|
||||
shaping = approach = alignment = 0.0
|
||||
|
||||
reward = shaping * self.W_SHAPING
|
||||
reward += approach * self.W_APPROACH
|
||||
reward += alignment * self.W_ALIGN
|
||||
reward += newly_penned * self.W_PEN_BONUS
|
||||
reward -= self.W_STEP_COST
|
||||
if n_penned == self.n_sheep:
|
||||
|
||||
Reference in New Issue
Block a user