Generator runner improvement
@@ -123,10 +123,3 @@ The default policy in `pipeline/defaults/vast.json` now targets:
 - `<= $0.20/hour`
 - sorted by `dlperf` descending
 - uses `vastai/pytorch:latest` as the default image
-
-## Diagnostics
-
-```bash
-python3 classifier/tools/analyze.py classifier/configs/phase2/p2_resnet18_facecrop.json
-python3 classifier/tools/ensemble.py classifier/configs/phase4/p4_ensemble.json
-```
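For context, the retained policy caps offers at $0.20/hour, ranks by DLPerf, and falls back to the stock PyTorch image. Below is a minimal sketch of how a runner might apply such a policy; the field names (`max_price_per_hour`, `sort_by`, `image`) and the offer keys are illustrative assumptions, not the actual schema of `pipeline/defaults/vast.json`.

```python
# Hypothetical offer selection under the vast.json defaults described above.
# Field and key names are assumptions for illustration only.
POLICY = {
    "max_price_per_hour": 0.20,        # <= $0.20/hour
    "sort_by": "dlperf",               # best DLPerf first
    "image": "vastai/pytorch:latest",  # default container image
}

def pick_offer(offers: list[dict]) -> dict:
    # Keep offers under the price cap, then take the highest-ranked one.
    affordable = [o for o in offers
                  if o["price_per_hour"] <= POLICY["max_price_per_hour"]]
    if not affordable:
        raise RuntimeError("no offers under the price cap")
    return max(affordable, key=lambda o: o[POLICY["sort_by"]])
```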
@@ -37,7 +37,7 @@ def main(config_path, *, data_dir_override=None, output_root="generator/outputs"
 
     cfg = load_config(config_path)
 
-    run_name = cfg["run_name"]
+    run_name = cfg.get("run_name", Path(config_path).stem)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     data_dir = data_dir_override or cfg.get("data_dir", "data")
     output_root = Path(output_root)
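With the `.get` fallback, a generator config no longer needs an explicit `run_name`; the config file's stem is used instead. A quick illustration (the path below is hypothetical):

```python
from pathlib import Path

# A config with no "run_name" key now falls back to the file's stem.
cfg: dict = {}  # e.g. parsed from a JSON config that omits "run_name"
config_path = "generator/configs/g1_baseline.json"  # hypothetical path
run_name = cfg.get("run_name", Path(config_path).stem)
print(run_name)  # -> "g1_baseline"
```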
@@ -7,7 +7,7 @@ from pipeline.orchestrator import EphemeralVastRunner, RunOptions
 # Accept one or more config files, or a single directory (all *.json inside, sorted)
 def _resolve_configs(raw: list[str]) -> list[Path]:
     if len(raw) == 1 and Path(raw[0]).is_dir():
-        configs = sorted(Path(raw[0]).glob("*.json"))
+        configs = sorted(p for p in Path(raw[0]).glob("*.json") if not p.name.startswith("_"))
         if not configs:
             raise ValueError(f"No JSON configs found in directory: {raw[0]}")
         return configs
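The added filter means a config directory can keep non-runnable fragments next to runnable configs, as long as they are underscore-prefixed. A sketch with a hypothetical layout:

```python
from pathlib import Path

# Hypothetical directory layout:
#   configs/phase2/_base.json        <- shared fragment, should not be scheduled
#   configs/phase2/p2_resnet18.json
#   configs/phase2/p2_vit_small.json
configs = sorted(
    p for p in Path("configs/phase2").glob("*.json")
    if not p.name.startswith("_")  # skip underscore-prefixed fragments
)
print([p.name for p in configs])  # -> ['p2_resnet18.json', 'p2_vit_small.json']
```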
@@ -429,21 +429,22 @@ class EphemeralVastRunner:
             seen.remove(resolved)
         return self._deep_merge_dicts(base_cfg, cfg)
 
-    # Return a stable signature used to detect duplicate training configs
-    def _normalized_config_signature(self, config_path: Path) -> str:
-        run_script, _ = self._module_for_config(config_path)
-        # Generator configs do not currently use shared/extends inheritance.
-        if run_script.startswith("generator/"):
-            with open(config_path, encoding="utf-8") as fh:
-                cfg = json.load(fh)
-        else:
-            cfg = self._load_config_with_extends(config_path)
-            shared_path = config_path.parent.parent / "shared.json"
-            if shared_path.exists():
-                with open(shared_path, encoding="utf-8") as fh:
-                    shared_cfg = json.load(fh)
-                cfg = self._deep_merge_dicts(shared_cfg, cfg)
+    # Resolve a config the same way the runners do: extends chain first,
+    # then shared.json overlaid underneath. Mirrors load_config() in
+    # classifier/src/utils/config.py and generator/src/utils/config.py — keep
+    # in sync if merge semantics ever change.
+    def _load_config_merged(self, config_path: Path) -> dict[str, Any]:
+        cfg = self._load_config_with_extends(config_path)
+        shared_path = config_path.parent.parent / "shared.json"
+        if shared_path.exists():
+            with open(shared_path, encoding="utf-8") as fh:
+                shared_cfg = json.load(fh)
+            cfg = self._deep_merge_dicts(shared_cfg, cfg)
+        return cfg
 
+    # Return a stable signature used to detect duplicate training configs
+    def _normalized_config_signature(self, config_path: Path) -> str:
+        cfg = self._load_config_merged(config_path)
         # run_name should not influence whether two configs are equivalent to train.
         cfg.pop("run_name", None)
         return json.dumps(cfg, sort_keys=True, separators=(",", ":"))
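Both the extends chain and the shared.json overlay funnel through `self._deep_merge_dicts`, whose implementation isn't shown in this diff. A plausible sketch, assuming the usual recursive dict-over-dict semantics where the second argument wins on conflicts:

```python
from typing import Any

def deep_merge_dicts(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    # Recursively overlay `override` on `base`; non-dict values in
    # `override` replace those in `base` wholesale.
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge_dicts(merged[key], value)
        else:
            merged[key] = value
    return merged

# shared.json supplies defaults; the per-run config overrides them.
shared = {"optimizer": {"name": "adamw", "lr": 1e-3}, "epochs": 20}
cfg = {"optimizer": {"lr": 3e-4}}
assert deep_merge_dicts(shared, cfg) == {
    "optimizer": {"name": "adamw", "lr": 3e-4},
    "epochs": 20,
}
```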
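With generator configs now resolved through the same merge path, two configs that flatten to the same dictionary, apart from their `run_name`, are treated as duplicates. A small demonstration of the signature logic (the config contents are hypothetical):

```python
import json

def normalized_signature(cfg: dict) -> str:
    cfg = dict(cfg)
    cfg.pop("run_name", None)  # naming alone doesn't change what gets trained
    return json.dumps(cfg, sort_keys=True, separators=(",", ":"))

a = {"run_name": "p2_resnet18_a", "model": "resnet18", "lr": 3e-4}
b = {"run_name": "p2_resnet18_b", "lr": 3e-4, "model": "resnet18"}
assert normalized_signature(a) == normalized_signature(b)  # flagged as duplicates
```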