Implementing inference sampling and quick tool polish

2026-05-04 21:05:06 +01:00
parent a750d177fa
commit a235dfd5f8
65 changed files with 151 additions and 64 deletions
@@ -1,24 +1,7 @@
-#!/usr/bin/env python3
-"""
-Pre-align face images using MTCNN landmarks + similarity transform.
-
-This is the generator-side counterpart to classifier/tools/facecrop.py.
-Difference: classifier uses bbox crop+resize; the generator wants landmark-based
-alignment so the eyes, nose, and mouth land at fixed pixel positions in every
-training image — structurally consistent training data for the generator.
-
-Output mirrors the source layout exactly:
-    data/wiki/14/37591914.jpg  ->  cropped/generator/wiki/14/37591914.jpg
-
-Resumable: already-aligned images are skipped by default.
-
-Usage:
-    python generator/tools/facecrop.py
-    python generator/tools/facecrop.py --data-dir data --output-dir cropped/generator
-    python generator/tools/facecrop.py --sources wiki --device cpu
-    python generator/tools/facecrop.py --size 128
-    python generator/tools/facecrop.py --no-skip-existing   # reprocess everything
-"""
+# Pre-align face images using MTCNN landmarks + similarity transform.
+# Generator-side counterpart to classifier/tools/facecrop.py — uses landmark-based alignment
+# (not bbox crop) so eyes, nose, and mouth land at fixed pixel positions in every image.
+# Usage: python generator/tools/facecrop.py [--data-dir data] [--output-dir cropped/generator] [--size 128]
 import argparse
 import sys
 import warnings
@@ -32,7 +15,7 @@ sys.path.insert(0, str(ROOT))
 # Generator trains on real images only (wiki). The other sources are AI-generated
 # and aren't used as training targets for the generator, so we don't align them
 # by default. Pass --sources to override.
-SOURCES = ["wiki"]
+SOURCES     = ["wiki"]
 ALL_SOURCES = ["wiki", "inpainting", "text2img", "insight"]

 # Reference landmark positions for a 128px aligned face.
@@ -50,44 +33,37 @@ _DETECTORS: dict[str, object] = {}


 def parse_args():
-    p = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    p.add_argument("--data-dir",   default="data",        help="Source dataset root (default: data)")
+    p = argparse.ArgumentParser()
+    p.add_argument("--data-dir",   default="data",             help="Source dataset root (default: data)")
    p.add_argument("--output-dir", default="cropped/generator", help="Output root (default: cropped/generator)")
-    p.add_argument("--size",       type=int, default=128, help="Output image size in px, square (default: 128)")
-    p.add_argument("--device",     default=None,          help="'cpu' or 'cuda'. Default: auto-detect")
+    p.add_argument("--size",       type=int, default=128,      help="Output image size in px, square (default: 128)")
+    p.add_argument("--device",     default=None,               help="'cpu' or 'cuda'. Default: auto-detect")
    p.add_argument("--sources",    nargs="+", default=None, metavar="SOURCE",
-                   help=f"Sources to process. Default: {', '.join(SOURCES)} (real images only). "
-                        f"All available: {', '.join(ALL_SOURCES)}")
+                   help=f"Sources to process. Default: {', '.join(SOURCES)}. All: {', '.join(ALL_SOURCES)}")
    p.add_argument("--skip-existing",    dest="skip_existing", action="store_true",  default=True,
-                   help="Skip images already present in output-dir (default: on, resumable)")
+                   help="Skip images already present in output-dir (default: on)")
    p.add_argument("--no-skip-existing", dest="skip_existing", action="store_false",
                   help="Re-process all images even if already aligned")
    return p.parse_args()


-# ── alignment helpers ─────────────────────────────────────────────────────────
+# ── Alignment helpers ─────────────────────────────────────────────────────────

+# Scale the 128px reference template to match the target size
 def _ref_landmarks(size: int):
-    """Reference landmarks scaled from the 128px template to `size`."""
    import numpy as np
    scale = size / 128.0
-    return np.asarray(
-        [(x * scale, y * scale) for x, y in REF_LANDMARKS_128],
-        dtype=np.float32,
-    )
+    return np.asarray([(x * scale, y * scale) for x, y in REF_LANDMARKS_128], dtype=np.float32)


+# Apply similarity transform so detected landmarks map to reference positions
 def _align_from_landmarks(img, landmarks, size: int):
-    """Apply similarity transform so detected landmarks map to reference positions."""
    import numpy as np
    from PIL import Image as PILImage
    from skimage.transform import SimilarityTransform, warp

-    src = np.asarray(landmarks, dtype=np.float32)  # 5x2 detected
-    dst = _ref_landmarks(size)                      # 5x2 reference
+    src = np.asarray(landmarks, dtype=np.float32)
+    dst = _ref_landmarks(size)

    try:
        tform = SimilarityTransform.from_estimate(src, dst)
@@ -105,35 +81,28 @@ def _align_from_landmarks(img, landmarks, size: int):

 def _center_crop(img, size: int):
    from PIL import Image as PILImage
-    w, h = img.size
-    side = min(w, h)
+    w, h  = img.size
+    side  = min(w, h)
    left, top = (w - side) // 2, (h - side) // 2
    return img.crop((left, top, left + side, top + side)).resize((size, size), PILImage.BILINEAR)


+# Returns (standard, relaxed) MTCNN detectors; cached per device
 def _get_detectors(device: str):
-    """Return (standard, relaxed) MTCNN detectors with landmarks enabled."""
    if device in _DETECTORS:
        return _DETECTORS[device]

    from facenet_pytorch import MTCNN

-    standard = MTCNN(
-        keep_all=False, select_largest=True,
-        min_face_size=15,
-        device=device, post_process=False,
-    )
-    relaxed = MTCNN(
-        keep_all=False, select_largest=True,
-        min_face_size=10,
-        thresholds=[0.5, 0.6, 0.6],
-        device=device, post_process=False,
-    )
+    standard = MTCNN(keep_all=False, select_largest=True, min_face_size=15,
+                     device=device, post_process=False)
+    relaxed  = MTCNN(keep_all=False, select_largest=True, min_face_size=10,
+                     thresholds=[0.5, 0.6, 0.6], device=device, post_process=False)
    _DETECTORS[device] = (standard, relaxed)
    return standard, relaxed


-# ── main ──────────────────────────────────────────────────────────────────────
+# ── Main ──────────────────────────────────────────────────────────────────────

 def main():
    args = parse_args()
@@ -158,7 +127,7 @@ def main():

    try:
        import facenet_pytorch  # noqa: F401
-        import skimage  # noqa: F401
+        import skimage          # noqa: F401
    except ImportError as exc:
        print(f"Error: missing dependency ({exc}).")
        print("       Run: pip install facenet-pytorch scikit-image")
@@ -182,9 +151,7 @@ def main():
    print(f"\nTotal images: {len(all_paths):,}\n")

    n_processed = n_skipped = n_error = 0
-    src_stats: dict[str, dict] = {
-        s: {"aligned": 0, "retry": 0, "fallback": 0} for s in sources
-    }
+    src_stats: dict[str, dict] = {s: {"aligned": 0, "retry": 0, "fallback": 0} for s in sources}

    for img_path in tqdm(all_paths, desc="Aligning", unit="img"):
        rel      = img_path.relative_to(data_dir)
@@ -215,8 +182,8 @@ def main():

            # Pass 2: upscale 2x and retry with relaxed thresholds
            if aligned is None:
-                w, h = img.size
-                img2x = img.resize((w * 2, h * 2), Image.BILINEAR)
+                w, h   = img.size
+                img2x  = img.resize((w * 2, h * 2), Image.BILINEAR)
                _, _, landmarks2 = relaxed.detect(img2x, landmarks=True)
                if landmarks2 is not None and len(landmarks2) > 0:
                    lm_orig = [(x / 2, y / 2) for x, y in landmarks2[0]]
@@ -253,9 +220,9 @@ def main():
    print(f"  {'Source':<12}  {'pass-1':>8}  {'pass-2':>8}  {'fallback':>8}  {'fallback%':>10}")
    print(f"  {'─'*12}  {'─'*8}  {'─'*8}  {'─'*8}  {'─'*10}")
    for src in sources:
-        s = src_stats[src]
+        s         = src_stats[src]
        total_src = s["aligned"] + s["retry"] + s["fallback"]
-        fb_pct = s["fallback"] / max(total_src, 1)
+        fb_pct    = s["fallback"] / max(total_src, 1)
        print(f"  {src:<12}  {s['aligned']:>8,}  {s['retry']:>8,}  {s['fallback']:>8,}  {fb_pct:>9.1%}")
    print(f"{'─' * 55}")
    print(f"  Output: {output_dir.resolve()}")
@@ -0,0 +1,120 @@
+# Generate 4x4 sample grids from Phase 5 EMA checkpoints, matching training visualization.
+# Usage: python generator/tools/sampling.py [--samples N] [--models p5_gan p5_vae p5_ddpm]
+import argparse
+import json
+import sys
+from pathlib import Path
+
+import torch
+from torchvision.utils import save_image
+
+# Parent of the directory containing this file; placed on sys.path so that the
+# function-level `src.*` imports in this module resolve when it is run as a script.
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+# Checkpoints are read from MODELS_DIR; the Phase 5 JSON configs are read from CFG_DIR.
+MODELS_DIR = ROOT / "outputs" / "models"
+CFG_DIR    = ROOT / "configs" / "phase5"
+
+# Run name -> model type, config file name (under CFG_DIR) and checkpoint file names
+# (under MODELS_DIR) listed in the order they are tried.
+# final_ema first: best_ema is the lowest-FID snapshot, which for slowly-converging models
+# (e.g. DDPM) can be saved while the EMA shadow is still close to random init.
+PHASE5_RUNS = {
+    "p5_gan":  {"type": "gan",  "config": "p5_gan.json",  "checkpoints": ["p5_gan_final_ema.pt",  "p5_gan_best_ema.pt"]},
+    "p5_vae":  {"type": "vae",  "config": "p5_vae.json",  "checkpoints": ["p5_vae_final_ema.pt",  "p5_vae_best_ema.pt"]},
+    "p5_ddpm": {"type": "ddpm", "config": "p5_ddpm.json", "checkpoints": ["p5_ddpm_final_ema.pt", "p5_ddpm_best_ema.pt"]},
+}
+
+
+# Load the JSON config of the Phase 5 run `name` from CFG_DIR and return it as a dict.
+# Raises KeyError if `name` is not a key of PHASE5_RUNS; errors from open() and
+# json.load() (missing file, malformed JSON) propagate unchanged.
+def _load_cfg(name: str) -> dict:
+    # JSON text is UTF-8; pass the encoding explicitly so decoding does not depend on
+    # the platform's locale default.
+    with open(CFG_DIR / PHASE5_RUNS[name]["config"], encoding="utf-8") as f:
+        return json.load(f)
+
+
+# Build the model described by `cfg` and load the first usable EMA checkpoint of run `name`.
+# Checkpoints are tried in the order listed in PHASE5_RUNS[name]["checkpoints"]; one that is
+# absent from MODELS_DIR, or whose state dict does not match the model, is reported and skipped.
+# Returns the model moved to `device` and switched to eval mode.
+# Raises FileNotFoundError when none of the listed checkpoints could be loaded.
+def _load_model(name: str, cfg: dict, device: torch.device):
+    from src.models import get_model
+
+    result, _ = get_model(cfg)
+    # GAN returns (generator, critic); all others return the model directly
+    model = result[0] if isinstance(result, tuple) else result
+
+    for ckpt_name in PHASE5_RUNS[name]["checkpoints"]:
+        ckpt_path = MODELS_DIR / ckpt_name
+        if not ckpt_path.exists():
+            print(f"  [{name}] {ckpt_name} not found, trying next")
+            continue
+
+        state_dict = torch.load(ckpt_path, map_location=device, weights_only=True)
+        try:
+            missing, unexpected = model.load_state_dict(state_dict, strict=False)
+        except RuntimeError as exc:
+            # strict=False only tolerates missing/unexpected keys; a tensor shape mismatch
+            # still raises. Handle it like any other incompatible checkpoint so the
+            # remaining candidates are still tried.
+            print(f"  [{name}] {ckpt_name}: incompatible state dict ({exc}) — trying next")
+            continue
+        if missing or unexpected:
+            # A rejected checkpoint may already have overwritten some weights. That is
+            # harmless: a checkpoint is only accepted when it covers every key of the model.
+            print(f"  [{name}] {ckpt_name}: missing={len(missing)} unexpected={len(unexpected)} — trying next")
+            continue
+        print(f"  [{name}] Loaded {ckpt_name}")
+        return model.to(device).eval()
+
+    raise FileNotFoundError(f"No usable EMA checkpoint found for {name}")
+
+
+# Returns a (16, C, H, W) tensor in [0, 1] ready for save_image with nrow=4
+@torch.no_grad()
+def _generate_grid(
+    name: str, model, cfg: dict, device: torch.device,
+    *, truncation: float | None = None,
+) -> torch.Tensor:
+    kind       = PHASE5_RUNS[name]["type"]
+    image_size = cfg.get("image_size", 64)
+
+    if kind == "gan":
+        z = torch.randn(16, cfg.get("latent_dim", 128), 1, 1, device=device)
+        if truncation is not None and truncation > 0:
+            z = z.clamp(-truncation, truncation)
+        imgs = model(z)
+    elif kind == "vae":
+        imgs = model.sample(16, device)
+    elif kind == "ddpm":
+        from src.training.diffusion import cosine_betas, linear_betas, make_alpha_bars, ddim_sample
+        T          = cfg.get("T", 1000)
+        betas      = (cosine_betas(T) if cfg.get("noise_schedule", "cosine") == "cosine" else linear_betas(T)).to(device)
+        alpha_bars = make_alpha_bars(betas)
+        # n_steps=50 mirrors the training-time preview; cfg's ddim_steps is for FID only
+        imgs = ddim_sample(
+            model, 16, image_size, alpha_bars,
+            n_steps=50, pred_type=cfg.get("pred_type", "eps"),
+            device=str(device), batch_size=16,
+        )
+    else:
+        raise ValueError(f"Unknown model type: {kind}")
+
+    return (imgs.clamp(-1, 1) + 1.0) / 2.0
+
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--samples",    type=int,  default=10, help="Number of 4x4 grids per model")
+    p.add_argument("--output-dir", type=Path,
+                   default=ROOT / "outputs" / "samples" / "final_comparison")
+    p.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
+    p.add_argument("--truncation", type=float, default=None,
+                   help="Optional GAN latent truncation (lower=less diversity but sharper)")
+    p.add_argument("--models", nargs="+", choices=list(PHASE5_RUNS.keys()),
+                   default=list(PHASE5_RUNS.keys()))
+    args   = p.parse_args()
+    device = torch.device(args.device)
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    for name in args.models:
+        print(f"[{name}] Loading checkpoint...")
+        cfg   = _load_cfg(name)
+        model = _load_model(name, cfg, device)
+
+        out_dir = args.output_dir / name
+        out_dir.mkdir(exist_ok=True)
+
+        print(f"[{name}] Generating {args.samples} grids -> {out_dir}")
+        for i in range(1, args.samples + 1):
+            grid = _generate_grid(name, model, cfg, device, truncation=args.truncation)
+            save_image(grid, out_dir / f"grid_{i:04d}.png", nrow=4)
+
+    print("Done.")
+
+
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()