Sheep training flock of 10 fix?
This commit is contained in:
@@ -80,14 +80,15 @@ def build_obs(dog_pos: np.ndarray,
|
|||||||
n_active = len(active_pos)
|
n_active = len(active_pos)
|
||||||
|
|
||||||
if n_active > 0:
|
if n_active > 0:
|
||||||
com = active_pos.mean(axis=0)
|
com = active_pos.mean(axis=0)
|
||||||
d_from_com = np.linalg.norm(active_pos - com, axis=1)
|
d_from_com = np.linalg.norm(active_pos - com, axis=1)
|
||||||
radius = float(d_from_com.max())
|
sorted_idx = np.argsort(d_from_com)[::-1]
|
||||||
mean_disp = float(d_from_com.mean())
|
radius = float(d_from_com[sorted_idx[0]])
|
||||||
far = active_pos[int(np.argmax(d_from_com))]
|
far = active_pos[sorted_idx[0]]
|
||||||
|
second_far_dist = float(d_from_com[sorted_idx[1]]) if len(sorted_idx) > 1 else 0.0
|
||||||
else:
|
else:
|
||||||
com = PEN_CENTER.copy()
|
com = PEN_CENTER.copy()
|
||||||
radius = mean_disp = 0.0
|
radius = second_far_dist = 0.0
|
||||||
far = PEN_CENTER.copy()
|
far = PEN_CENTER.copy()
|
||||||
|
|
||||||
frac_active = n_active / max(n_sheep, 1)
|
frac_active = n_active / max(n_sheep, 1)
|
||||||
@@ -98,8 +99,8 @@ def build_obs(dog_pos: np.ndarray,
|
|||||||
(far[0] - dog_pos[0]) / D, (far[1] - dog_pos[1]) / D,
|
(far[0] - dog_pos[0]) / D, (far[1] - dog_pos[1]) / D,
|
||||||
(PEN_CENTER[0] - com[0]) / D, (PEN_CENTER[1] - com[1]) / D,
|
(PEN_CENTER[0] - com[0]) / D, (PEN_CENTER[1] - com[1]) / D,
|
||||||
(PEN_CENTER[0] - far[0]) / D, (PEN_CENTER[1] - far[1]) / D,
|
(PEN_CENTER[0] - far[0]) / D, (PEN_CENTER[1] - far[1]) / D,
|
||||||
radius / D,
|
radius / D,
|
||||||
mean_disp / D,
|
second_far_dist / D,
|
||||||
frac_active,
|
frac_active,
|
||||||
], dtype=np.float32)
|
], dtype=np.float32)
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,223 @@
|
|||||||
|
"""
|
||||||
|
Episode-level diagnostics for the herding policy.
|
||||||
|
|
||||||
|
Runs N episodes and for each one tracks:
|
||||||
|
- flock radius over time
|
||||||
|
- COM-to-pen distance over time
|
||||||
|
- dog position over time
|
||||||
|
- when (if ever) the flock first became compact
|
||||||
|
- failure mode classification
|
||||||
|
|
||||||
|
Then produces:
|
||||||
|
1. Console summary of failure modes
|
||||||
|
2. Per-episode time-series plots (radius + com_dist)
|
||||||
|
3. Optional rendered playback of the worst episodes
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
python diagnose.py --model runs/ppo_consolidation/final_model.zip \
|
||||||
|
--vecnorm runs/ppo_consolidation/vecnorm.pkl \
|
||||||
|
--n-sheep 5 --episodes 20
|
||||||
|
|
||||||
|
# Watch the policy live (first episode rendered):
|
||||||
|
python diagnose.py ... --render
|
||||||
|
|
||||||
|
# Save plots to a directory instead of showing interactively:
|
||||||
|
python diagnose.py ... --plot-dir debug_plots/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib.patches as mpatches
|
||||||
|
|
||||||
|
from stable_baselines3 import PPO
|
||||||
|
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
|
||||||
|
from herding_env import HerdingEnv
|
||||||
|
|
||||||
|
|
||||||
|
# ── failure mode constants ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Flock radius (m) at or below which a step counts as "compact" in the
# failure classification and in the per-episode plots.
COMPACT_RADIUS = 5.0 # must match DRIVE_GATE_RADIUS in herding_env.py
|
||||||
|
|
||||||
|
|
||||||
|
def classify_failure(ep_radius, ep_com_dist, n_penned, n_sheep, success,
                     compact_radius=None, pen_close=3.0):
    """Label one finished episode with its outcome / failure mode.

    Args:
        ep_radius: Per-step flock radius samples (m).
        ep_com_dist: Per-step COM-to-pen distance samples (m), expected to be
            aligned index-for-index with ``ep_radius``.
        n_penned: Number of sheep penned at the end of the episode.
        n_sheep: Total number of sheep in the episode.
        success: True when every sheep was penned.
        compact_radius: Radius at or below which the flock counts as compact.
            ``None`` (the default) uses the module-level ``COMPACT_RADIUS``.
        pen_close: COM-to-pen distance (m) that counts as "got close".

    Returns:
        One of ``"SUCCESS"``, ``"NEVER_COMPACT"``, ``"COMPACT_CANT_DRIVE"``,
        ``"DROVE_NO_SHEEP"`` or ``"PARTIAL_<k>of<n>"``.
    """
    if success:
        return "SUCCESS"
    if compact_radius is None:
        compact_radius = COMPACT_RADIUS

    # A single pass finds the first compact step.  ``None`` also covers an
    # empty trace, where the previous min()-based check raised ValueError.
    first_compact = next(
        (i for i, r in enumerate(ep_radius) if r <= compact_radius), None)
    if first_compact is None:
        return "NEVER_COMPACT"      # flock was always too scattered

    # Only approaches made after the flock first compacted count as driving.
    # ``default`` keeps a too-short distance trace from raising.
    min_com_after = min(ep_com_dist[first_compact:], default=float("inf"))
    if min_com_after > pen_close:
        return "COMPACT_CANT_DRIVE"  # compacted but never drove to pen
    if n_penned == 0:
        return "DROVE_NO_SHEEP"      # got near pen, nothing went in
    return f"PARTIAL_{n_penned}of{n_sheep}"  # some in, not all
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _positive_int(text):
    """argparse ``type`` callback: parse *text* as an integer >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid int value: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {value}")
    return value


def parse_args(argv=None):
    """Parse the diagnostics command line.

    Args:
        argv: Optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Returns:
        An ``argparse.Namespace`` with ``model``, ``vecnorm``, ``n_sheep``,
        ``episodes``, ``max_steps``, ``render``, ``plot_dir`` and ``seed``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--model", required=True)
    p.add_argument("--vecnorm", default=None)
    # Counts must be >= 1: zero episodes would otherwise reach the summary
    # and divide by zero there.
    p.add_argument("--n-sheep", type=_positive_int, default=5)
    p.add_argument("--episodes", type=_positive_int, default=20)
    p.add_argument("--max-steps", type=_positive_int, default=4000)
    p.add_argument("--render", action="store_true",
                   help="Show matplotlib animation of the first episode")
    p.add_argument("--plot-dir", default=None,
                   help="Save time-series plots here (one per episode)")
    p.add_argument("--seed", type=int, default=0)
    return p.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
def make_env(n_sheep, max_steps, render_mode=None):
    """Return a zero-argument factory that constructs one ``HerdingEnv``.

    The environment is not built here; the returned callable creates it with
    the captured ``n_sheep``, ``max_steps`` and ``render_mode`` when invoked,
    which is the form ``DummyVecEnv`` is given in ``main``.
    """
    return lambda: HerdingEnv(
        n_sheep=n_sheep,
        max_steps=max_steps,
        render_mode=render_mode,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _inner_env(env):
    """Return the first underlying env of a (possibly wrapped) vec env."""
    return env.envs[0] if hasattr(env, "envs") else env.venv.envs[0]


def _run_episode(model, env, inner):
    """Roll out one deterministic episode.

    Returns:
        ``(trace, steps, info)`` where ``trace`` maps ``radius``,
        ``com_dist``, ``dog_x``, ``dog_y`` and ``n_penned`` to per-step
        lists, ``steps`` is the number of env steps taken and ``info`` is
        the info dict of the final step.
    """
    obs = env.reset()
    trace = dict(radius=[], com_dist=[], dog_x=[], dog_y=[], n_penned=[])
    steps = 0
    done = False
    info = {}

    while not done:
        # Sample BEFORE stepping.  DummyVecEnv resets the wrapped env as soon
        # as an episode ends, so reading `inner` after the terminal step
        # would record the next episode's initial state.  The terminal state
        # itself is therefore not sampled; final counts come from `info`.
        com, radius, _ = inner._flock_stats()
        trace["radius"].append(radius)
        trace["com_dist"].append(float(np.linalg.norm(com - inner.PEN_CENTER)))
        trace["dog_x"].append(float(inner.dog_pos[0]))
        trace["dog_y"].append(float(inner.dog_pos[1]))
        trace["n_penned"].append(int(inner.penned[:inner.n_sheep].sum()))

        action, _ = model.predict(obs, deterministic=True)
        obs, _, dones, infos = env.step(action)
        done = dones[0]
        info = infos[0]
        steps += 1

    return trace, steps, info


def _plot_episode(ep, trace, n_sheep, mode, compact_step, plot_dir):
    """Draw radius and COM-to-pen time series; save to *plot_dir* or show."""
    ep_radius = trace["radius"]
    ep_com_dist = trace["com_dist"]

    fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
    t = np.arange(len(ep_radius))

    axes[0].plot(t, ep_radius, color="steelblue", label="flock radius (m)")
    axes[0].axhline(COMPACT_RADIUS, color="orange", linestyle="--",
                    label=f"compact threshold ({COMPACT_RADIUS}m)")
    if compact_step is not None:
        axes[0].axvline(compact_step, color="green", linestyle=":",
                        alpha=0.6, label=f"first compact (step {compact_step})")
    axes[0].set_ylabel("radius (m)")
    axes[0].legend(fontsize=8)
    axes[0].set_title(f"ep {ep+1} | n_sheep={n_sheep} | {mode}")

    axes[1].plot(t, ep_com_dist, color="tomato", label="COM-to-pen dist (m)")
    axes[1].set_ylabel("COM-to-pen (m)")
    axes[1].set_xlabel("step")
    axes[1].legend(fontsize=8)

    plt.tight_layout()
    if plot_dir:
        fig.savefig(os.path.join(plot_dir, f"ep{ep+1:03d}_{mode}.png"),
                    dpi=100)
        plt.close(fig)
    else:
        plt.show(block=False)
        plt.pause(0.5)


def _print_summary(args, failure_counts):
    """Print the failure-mode histogram and the heuristic diagnosis."""
    print("\n" + "=" * 55)
    print(f" Model : {args.model}")
    print(f" n_sheep : {args.n_sheep} episodes : {args.episodes}")
    print("-" * 55)
    total = sum(failure_counts.values())
    for mode, cnt in sorted(failure_counts.items(), key=lambda x: -x[1]):
        bar = "█" * cnt
        print(f" {mode:<26} {cnt:>3}/{total} {bar}")
    print("-" * 55)

    # With no episodes the ratios below would divide by zero.
    if total == 0:
        print(" No episodes were run; nothing to diagnose.")
        print("=" * 55)
        return

    never_compact = failure_counts.get("NEVER_COMPACT", 0)
    cant_drive = failure_counts.get("COMPACT_CANT_DRIVE", 0)
    partial = sum(v for k, v in failure_counts.items() if k.startswith("PARTIAL"))
    successes = failure_counts.get("SUCCESS", 0)

    print("\n Diagnosis:")
    if never_compact / total > 0.5:
        print(" ► COLLECT problem: dog rarely compacts the flock.")
        print(" → Phase-gate W_DRIVE, increase W_COLLECT, check alignment reward.")
    if cant_drive / total > 0.3:
        print(" ► DRIVE problem: flock compacts but doesn't reach pen.")
        print(" → Check dog alignment, pen direction, W_DRIVE magnitude.")
    if partial / total > 0.3:
        print(" ► PARTIAL problem: some sheep penned, stragglers remain.")
        print(" → Flock splits; need better straggler-chasing behavior.")
    if successes / total > 0.5:
        print(" ► Mostly working! Fine-tune for consistency.")
    print("=" * 55)


def main():
    """Run the diagnostic episodes, print per-episode lines and a summary.

    Builds a single-env ``DummyVecEnv`` (optionally wrapped with saved
    ``VecNormalize`` statistics), loads the PPO model, rolls out
    ``--episodes`` deterministic episodes, classifies each one and plots the
    time series for saved runs or for the first five interactive ones.
    """
    args = parse_args()

    if args.plot_dir:
        os.makedirs(args.plot_dir, exist_ok=True)
        matplotlib.use("Agg")

    render_mode = "human" if args.render else None
    raw_env = DummyVecEnv([make_env(args.n_sheep, args.max_steps, render_mode)])

    if args.vecnorm:
        env = VecNormalize.load(args.vecnorm, raw_env)
        env.training = False
        env.norm_reward = False
    else:
        env = raw_env

    failure_counts = {}
    try:
        # --seed was parsed but never applied; seed the vec env before the
        # first reset.  NOTE(review): this only makes runs repeatable if
        # HerdingEnv draws from the RNG that the vec env seeds — confirm.
        env.seed(args.seed)

        model = PPO.load(args.model, env=env)
        inner = _inner_env(env)

        for ep in range(args.episodes):
            trace, step, info = _run_episode(model, env, inner)
            ep_radius = trace["radius"]
            ep_com_dist = trace["com_dist"]

            n_pen = info.get("n_penned", 0)
            n_sheep = info.get("n_sheep", args.n_sheep)
            success = n_pen == n_sheep
            mode = classify_failure(ep_radius, ep_com_dist, n_pen, n_sheep, success)

            failure_counts[mode] = failure_counts.get(mode, 0) + 1

            compact_step = next((i for i, r in enumerate(ep_radius)
                                 if r <= COMPACT_RADIUS), None)
            min_radius = min(ep_radius)
            min_com_dist = min(ep_com_dist)

            print(f" ep {ep+1:>3} steps={step:>5} penned={n_pen}/{n_sheep}"
                  f" min_r={min_radius:.1f}m"
                  f" min_com={min_com_dist:.1f}m"
                  f" compact@step={compact_step if compact_step is not None else 'NEVER'}"
                  f" [{mode}]")

            # ── per-episode time-series plot ──────────────────────────────
            if args.plot_dir or (not args.render and ep < 5):
                _plot_episode(ep, trace, n_sheep, mode, compact_step,
                              args.plot_dir)
    finally:
        # Release the env (and any render window) even if a rollout fails.
        env.close()

    _print_summary(args, failure_counts)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the diagnostics only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
+49
-33
@@ -51,14 +51,17 @@ class HerdingEnv(gym.Env):
|
|||||||
WALL_MARGIN = 3.5
|
WALL_MARGIN = 3.5
|
||||||
|
|
||||||
# -----------------------------------------------------------------------
|
# -----------------------------------------------------------------------
|
||||||
# Reward weights (progress-based potential shaping + sparse bonuses)
|
# Reward weights (two-phase: collect first, then drive)
|
||||||
# -----------------------------------------------------------------------
|
# -----------------------------------------------------------------------
|
||||||
W_DRIVE = 2.0 # progress: flock COM moved toward pen
|
W_DRIVE = 2.0 # progress: COM moved toward pen (only when compact)
|
||||||
W_COLLECT = 2.0 # progress: flock radius shrank (was 0.5 — must match W_DRIVE)
|
W_COLLECT = 4.0 # progress: radius shrank (2× stronger when scattered)
|
||||||
W_ALIGN = 0.5 # position: dog on anti-pen side of flock COM
|
W_ALIGN = 0.5 # position: dog on anti-pen side of COM
|
||||||
W_PEN_BONUS = 10.0 # per sheep penned (was 5.0)
|
W_COMPACT_BONUS = 0.1 # per-step bonus for staying compact (sustained signal)
|
||||||
W_COMPLETE = 100.0 # all sheep penned (was 20.0 — must dominate dense rewards)
|
W_PEN_BONUS = 10.0 # per sheep penned
|
||||||
W_STEP_COST = 0.002 # time penalty
|
W_COMPLETE = 100.0 # all sheep penned
|
||||||
|
W_STEP_COST = 0.002 # time penalty
|
||||||
|
|
||||||
|
DRIVE_GATE_RADIUS = 5.0 # flock must compact below this (m) before drive reward fires
|
||||||
|
|
||||||
def __init__(self, n_sheep: int = 1, max_steps: int = 2000,
|
def __init__(self, n_sheep: int = 1, max_steps: int = 2000,
|
||||||
render_mode: str = None, random_n_sheep: bool = False):
|
render_mode: str = None, random_n_sheep: bool = False):
|
||||||
@@ -71,7 +74,7 @@ class HerdingEnv(gym.Env):
|
|||||||
|
|
||||||
# Fixed 13-dim observation regardless of n_sheep:
|
# Fixed 13-dim observation regardless of n_sheep:
|
||||||
# dog_pos(2) + rel_com(2) + rel_far(2) + com_to_pen(2)
|
# dog_pos(2) + rel_com(2) + rel_far(2) + com_to_pen(2)
|
||||||
# + far_to_pen(2) + radius(1) + mean_disp(1) + frac_penned(1)
|
# + far_to_pen(2) + radius(1) + second_far_dist(1) + frac_penned(1)
|
||||||
self.observation_space = spaces.Box(
|
self.observation_space = spaces.Box(
|
||||||
low=-np.inf, high=np.inf, shape=(13,), dtype=np.float32
|
low=-np.inf, high=np.inf, shape=(13,), dtype=np.float32
|
||||||
)
|
)
|
||||||
@@ -259,60 +262,73 @@ class HerdingEnv(gym.Env):
|
|||||||
return com, float(dists.max()), float(dists.mean())
|
return com, float(dists.max()), float(dists.mean())
|
||||||
|
|
||||||
def _obs(self) -> np.ndarray:
|
def _obs(self) -> np.ndarray:
|
||||||
com, radius, mean_disp = self._flock_stats()
|
com, radius, _ = self._flock_stats()
|
||||||
active_mask = ~self.penned[:self.n_sheep]
|
active_mask = ~self.penned[:self.n_sheep]
|
||||||
|
|
||||||
# Farthest active sheep from COM (outlier the dog needs to chase)
|
|
||||||
if active_mask.any():
|
if active_mask.any():
|
||||||
pts = self.sheep_pos[:self.n_sheep][active_mask]
|
pts = self.sheep_pos[:self.n_sheep][active_mask]
|
||||||
idx = int(np.argmax(np.linalg.norm(pts - com, axis=1)))
|
dists = np.linalg.norm(pts - com, axis=1)
|
||||||
far = pts[idx]
|
sorted_idx = np.argsort(dists)[::-1] # farthest first
|
||||||
|
far = pts[sorted_idx[0]]
|
||||||
|
# 2nd farthest — if only 1 active sheep, reuse the same position
|
||||||
|
far2 = pts[sorted_idx[1]] if len(sorted_idx) > 1 else far
|
||||||
|
second_far_dist = float(dists[sorted_idx[1]]) if len(sorted_idx) > 1 else 0.0
|
||||||
else:
|
else:
|
||||||
far = self.PEN_CENTER.copy()
|
far = far2 = self.PEN_CENTER.copy()
|
||||||
|
second_far_dist = 0.0
|
||||||
|
|
||||||
S = self.FIELD # normalisation scale for positions
|
S = self.FIELD
|
||||||
D = 2 * self.FIELD # for relative vectors that can span the whole field
|
D = 2 * self.FIELD
|
||||||
|
|
||||||
return np.array([
|
return np.array([
|
||||||
self.dog_pos[0] / S, self.dog_pos[1] / S, # dog abs pos
|
self.dog_pos[0] / S, self.dog_pos[1] / S,
|
||||||
(com[0] - self.dog_pos[0]) / D, # COM relative to dog
|
(com[0] - self.dog_pos[0]) / D,
|
||||||
(com[1] - self.dog_pos[1]) / D,
|
(com[1] - self.dog_pos[1]) / D,
|
||||||
(far[0] - self.dog_pos[0]) / D, # farthest relative to dog
|
(far[0] - self.dog_pos[0]) / D,
|
||||||
(far[1] - self.dog_pos[1]) / D,
|
(far[1] - self.dog_pos[1]) / D,
|
||||||
(self.PEN_CENTER[0] - com[0]) / D, # COM to pen
|
(self.PEN_CENTER[0] - com[0]) / D,
|
||||||
(self.PEN_CENTER[1] - com[1]) / D,
|
(self.PEN_CENTER[1] - com[1]) / D,
|
||||||
(self.PEN_CENTER[0] - far[0]) / D, # farthest to pen
|
(self.PEN_CENTER[0] - far[0]) / D,
|
||||||
(self.PEN_CENTER[1] - far[1]) / D,
|
(self.PEN_CENTER[1] - far[1]) / D,
|
||||||
radius / D, # flock compactness
|
radius / D,
|
||||||
mean_disp / D, # mean spread
|
second_far_dist / D, # replaced mean_disp: 2nd farthest sheep from COM
|
||||||
active_mask.sum() / self.n_sheep, # fraction still active
|
active_mask.sum() / self.n_sheep,
|
||||||
], dtype=np.float32)
|
], dtype=np.float32)
|
||||||
|
|
||||||
def _reward(self, n_penned: int, newly_penned: int) -> float:
|
def _reward(self, n_penned: int, newly_penned: int) -> float:
|
||||||
com, radius, _ = self._flock_stats()
|
com, radius, _ = self._flock_stats()
|
||||||
com_dist = float(np.linalg.norm(com - self.PEN_CENTER))
|
com_dist = float(np.linalg.norm(com - self.PEN_CENTER))
|
||||||
|
|
||||||
# Progress rewards: positive when state improves
|
drive_delta = self._prev_com_dist - com_dist
|
||||||
drive_progress = (self._prev_com_dist - com_dist) * self.W_DRIVE
|
collect_delta = self._prev_radius - radius
|
||||||
collect_progress = (self._prev_radius - radius) * self.W_COLLECT
|
|
||||||
|
|
||||||
self._prev_com_dist = com_dist
|
self._prev_com_dist = com_dist
|
||||||
self._prev_radius = radius
|
self._prev_radius = radius
|
||||||
|
|
||||||
# Alignment: reward dog for being on the anti-pen side of the flock
|
# Alignment: dog on anti-pen side of COM, gated by proximity.
|
||||||
# COM, gated by proximity so only nearby positioning counts.
|
|
||||||
# +1 = dog directly behind flock, -1 = dog on pen side (wrong).
|
|
||||||
d_dog_com = float(np.linalg.norm(self.dog_pos - com))
|
d_dog_com = float(np.linalg.norm(self.dog_pos - com))
|
||||||
if d_dog_com > 0.1 and com_dist > 0.1:
|
if d_dog_com > 0.1 and com_dist > 0.1:
|
||||||
pen_dir = (self.PEN_CENTER - com) / com_dist # COM → pen
|
pen_dir = (self.PEN_CENTER - com) / com_dist
|
||||||
dog_dir = (self.dog_pos - com) / d_dog_com # COM → dog
|
dog_dir = (self.dog_pos - com) / d_dog_com
|
||||||
cosine = -float(np.dot(pen_dir, dog_dir)) # +1 when opposite
|
cosine = -float(np.dot(pen_dir, dog_dir))
|
||||||
proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST)
|
proximity = max(0.0, 1.0 - d_dog_com / self.FLEE_DIST)
|
||||||
alignment = cosine * proximity * self.W_ALIGN
|
alignment = cosine * proximity * self.W_ALIGN
|
||||||
else:
|
else:
|
||||||
alignment = 0.0
|
alignment = 0.0
|
||||||
|
|
||||||
reward = drive_progress + collect_progress + alignment
|
scattered = radius > self.DRIVE_GATE_RADIUS
|
||||||
|
|
||||||
|
# Collect always on; 2× scale when scattered to force collect-first.
|
||||||
|
r_collect = collect_delta * self.W_COLLECT * (2.0 if scattered else 1.0)
|
||||||
|
|
||||||
|
# Drive only fires when flock is compact — prevents rewarding COM movement
|
||||||
|
# while sheep are spread across the field.
|
||||||
|
r_drive = 0.0 if scattered else drive_delta * self.W_DRIVE
|
||||||
|
|
||||||
|
# Small sustained reward for maintaining a compact flock.
|
||||||
|
r_compact = 0.0 if scattered else self.W_COMPACT_BONUS
|
||||||
|
|
||||||
|
reward = r_drive + r_collect + r_compact + alignment
|
||||||
reward += newly_penned * self.W_PEN_BONUS
|
reward += newly_penned * self.W_PEN_BONUS
|
||||||
reward -= self.W_STEP_COST
|
reward -= self.W_STEP_COST
|
||||||
if n_penned == self.n_sheep:
|
if n_penned == self.n_sheep:
|
||||||
|
|||||||
Reference in New Issue
Block a user