Remove the config deduplication system
@@ -399,86 +399,6 @@ class EphemeralVastRunner:
             return "generator/run.py", "generator/outputs"
         return "classifier/run.py", "classifier/outputs"
 
-    # Merge two dicts recursively (override wins on leaf keys)
-    @staticmethod
-    def _deep_merge_dicts(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
-        result = base.copy()
-        for key, value in override.items():
-            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
-                result[key] = EphemeralVastRunner._deep_merge_dicts(result[key], value)
-            else:
-                result[key] = value
-        return result
-
-    # Recursively resolve "extends" references in JSON configs with cycle detection
-    def _load_config_with_extends(self, config_path: Path, seen: set[Path] | None = None) -> dict[str, Any]:
-        if seen is None:
-            seen = set()
-        resolved = config_path.resolve()
-        if resolved in seen:
-            raise ValueError(f"Circular config inheritance detected at: {config_path}")
-        seen.add(resolved)
-        with open(config_path, encoding="utf-8") as fh:
-            cfg = json.load(fh)
-        base_ref = cfg.pop("extends", None)
-        if not base_ref:
-            seen.remove(resolved)
-            return cfg
-        base_path = (config_path.parent / base_ref).resolve()
-        base_cfg = self._load_config_with_extends(base_path, seen=seen)
-        seen.remove(resolved)
-        return self._deep_merge_dicts(base_cfg, cfg)
-
-    # Resolve a config the same way the runners do: extends chain first,
-    # then shared.json overlaid underneath. Mirrors load_config() in
-    # classifier/src/utils/config.py and generator/src/utils/config.py — keep
-    # in sync if merge semantics ever change.
-    def _load_config_merged(self, config_path: Path) -> dict[str, Any]:
-        cfg = self._load_config_with_extends(config_path)
-        shared_path = config_path.parent.parent / "shared.json"
-        if shared_path.exists():
-            with open(shared_path, encoding="utf-8") as fh:
-                shared_cfg = json.load(fh)
-            cfg = self._deep_merge_dicts(shared_cfg, cfg)
-        return cfg
-
-    # Return a stable signature used to detect duplicate training configs
-    def _normalized_config_signature(self, config_path: Path) -> str:
-        cfg = self._load_config_merged(config_path)
-        # run_name should not influence whether two configs are equivalent to train.
-        cfg.pop("run_name", None)
-        return json.dumps(cfg, sort_keys=True, separators=(",", ":"))
-
-    # Detect configs that are pure extends (only run_name + extends, no new training settings).
-    # These are pointers to an already-trained experiment and should be skipped.
-    def _is_pure_extend(self, config_path: Path) -> str | None:
-        with open(config_path, encoding="utf-8") as fh:
-            raw = json.load(fh)
-        if "extends" not in raw:
-            return None
-        non_meta = {k for k in raw if k not in ("run_name", "extends")}
-        if non_meta:
-            return None
-        base_path = (config_path.parent / raw["extends"]).resolve()
-        return str(base_path.relative_to(self.project_root))
-
-    def _dedupe_training_configs(self, config_paths: list[Path]) -> list[Path]:
-        seen: dict[str, Path] = {}
-        deduped: list[Path] = []
-        for cp in config_paths:
-            pure_extends = self._is_pure_extend(cp)
-            if pure_extends is not None:
-                print(f"Skipping {cp.name} (pure extend of {pure_extends})")
-                continue
-            sig = self._normalized_config_signature(cp)
-            if sig in seen:
-                first = seen[sig]
-                print(f"Skipping duplicate config {cp.name} (same training settings as {first.name})")
-                continue
-            seen[sig] = cp
-            deduped.append(cp)
-        return deduped
-
     # ── remote directory checks ───────────────────────────────────────
 
     # Check whether a directory exists on the remote host
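Everything this hunk deletes is the config-resolution stack the dedupe pass depended on: a recursive dict merge, an "extends" resolver with cycle detection, a shared.json overlay, and the signature and pure-extend helpers built on top. Here are the merge semantics as a minimal standalone sketch; the file names and values are invented for illustration, not taken from the repository:

import json
import tempfile
from pathlib import Path
from typing import Any

def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    # Override wins on leaf keys; nested dicts are merged key by key.
    result = base.copy()
    for key, value in override.items():
        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
            result[key] = deep_merge(result[key], value)
        else:
            result[key] = value
    return result

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "base.json").write_text(json.dumps({"model": {"lr": 1e-4, "layers": 12}, "epochs": 3}))
    (root / "child.json").write_text(json.dumps({"extends": "base.json", "model": {"lr": 3e-5}}))

    child = json.loads((root / "child.json").read_text())
    base = json.loads((root / child.pop("extends")).read_text())
    print(deep_merge(base, child))  # {'model': {'lr': 3e-05, 'layers': 12}, 'epochs': 3}

Nested dicts merge recursively while leaf keys from the child win, which is what let a child config override a single hyperparameter without restating the rest of its base.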
@@ -639,11 +559,6 @@ class EphemeralVastRunner:
                 raise FileNotFoundError(f"Config not found: {cp}")
             resolved.append(cp)
 
-        resolved = self._dedupe_training_configs(resolved)
-        # Abort early if all configs were duplicates
-        if not resolved:
-            raise ValueError("No unique configs to run after deduplication.")
-
         n = len(resolved)
         _, first_output_root = self._module_for_config(resolved[0])
         local_output_root = self.project_root / first_output_root
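The call site removed in this second hunk fed each resolved config through _normalized_config_signature. The trick there is serializing the fully merged config with sorted keys and compact separators, so key order and whitespace never matter, and dropping run_name so renamed reruns compare equal. A sketch with made-up config contents:

import json

def signature(cfg: dict) -> str:
    cfg = dict(cfg)  # work on a copy; don't mutate the caller's config
    cfg.pop("run_name", None)  # naming a run differently doesn't change what trains
    return json.dumps(cfg, sort_keys=True, separators=(",", ":"))

a = {"run_name": "exp-a", "model": {"lr": 3e-5}, "epochs": 3}
b = {"epochs": 3, "model": {"lr": 3e-5}, "run_name": "exp-b"}
print(signature(a) == signature(b))  # True: identical training settings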
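The other skip rule, _is_pure_extend, treated a config containing nothing but run_name and extends as a pointer to an already-trained experiment rather than a new one. The rule itself reduces to a set check; the config below is hypothetical:

raw = {"run_name": "rerun-of-baseline", "extends": "baseline.json"}

# Pure extend: has "extends" and introduces no training settings of its own.
is_pure = "extends" in raw and not (raw.keys() - {"run_name", "extends"})
print(is_pure)  # True: _dedupe_training_configs would have skipped this file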
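One behavior that disappears with the resolver is its cycle guard: _load_config_with_extends carried a `seen` set of resolved paths through the recursion and raised on re-entry, so two configs extending each other failed fast instead of recursing until the interpreter gave up. A trimmed sketch with invented file names (shallow merge here for brevity; the removed helper merged recursively):

import json
import tempfile
from pathlib import Path

def load_with_extends(path: Path, seen: set[Path] | None = None) -> dict:
    seen = set() if seen is None else seen
    resolved = path.resolve()
    if resolved in seen:
        raise ValueError(f"Circular config inheritance detected at: {path}")
    seen.add(resolved)
    cfg = json.loads(path.read_text(encoding="utf-8"))
    base_ref = cfg.pop("extends", None)
    if base_ref:
        # Resolve the base first, then let this config's keys override it.
        cfg = {**load_with_extends(path.parent / base_ref, seen), **cfg}
    seen.remove(resolved)
    return cfg

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "a.json").write_text('{"extends": "b.json", "x": 1}')
    (root / "b.json").write_text('{"extends": "a.json", "y": 2}')
    try:
        load_with_extends(root / "a.json")
    except ValueError as err:
        print(err)  # Circular config inheritance detected at: .../a.json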