diff --git a/generator/notebooks/_build.py b/generator/notebooks/_build.py
deleted file mode 100644
index bf281ae..0000000
--- a/generator/notebooks/_build.py
+++ /dev/null
@@ -1,1607 +0,0 @@
-"""
-Build all generator analysis notebooks from a single source of truth.
-
-Run from generator/notebooks/:  python _build.py
-
-The notebooks are report chapters, not experiment launchers. They load saved
-logs, samples, checkpoints, and figures only. Each phase follows a consistent
-story structure: goal, what changed, evidence, decision, and conclusion.
-
-Real metric values are pulled from outputs/logs/*.json at build time and
-rendered into markdown headers and conclusions, so reports never drift from data.
-The generated filenames are numbered to make the intended reading order clear.
-"""
-import json
-from pathlib import Path
-
-# Project layout, resolved from this script's own location (generator/notebooks/_build.py).
-ROOT = Path(__file__).resolve().parent.parent
-LOGS = ROOT.joinpath("outputs", "logs")
-OUT = ROOT.joinpath("notebooks")
-
-# Phase key -> generated notebook stem; the numeric prefix fixes the reading order.
-NOTEBOOK_SEQUENCE = dict(
-    phase0="01_baseline_sanity_check",
-    phase1="02_pipeline_selection",
-    phase2="03_gan_stability_progression",
-    phase3="04_vae_loss_progression",
-    phase4="05_ddpm_recipe_progression",
-    phase5="06_final_family_comparison",
-    phase6="07_final_sample_showcase",
-)
-
-# Glob patterns (relative to OUT) that remove_old_generated_notebooks() deletes.
-OLD_GENERATED_PATTERNS = ["phase*.ipynb"]
-
-
-# notebook helpers
-def md(text):
-    """Return a markdown cell dict whose source is *text* split into lines (line endings kept)."""
-    source_lines = text.splitlines(keepends=True)
-    return {"cell_type": "markdown", "metadata": {}, "source": source_lines}
-def code(text):
-    """Return a code cell dict (no outputs, no execution count) whose source is *text* split into lines."""
-    cell = {"cell_type": "code", "metadata": {}, "execution_count": None, "outputs": []}
-    cell["source"] = text.splitlines(keepends=True)
-    return cell
-
-def write_nb(name, cells):
-    """Write *cells* as an nbformat 4.5 notebook to OUT/<name>.ipynb and print the path relative to ROOT."""
-    kernelspec = {"name": "python3", "display_name": "Python 3"}
-    metadata = {"kernelspec": kernelspec, "language_info": {"name": "python"}}
-    notebook = {"cells": cells, "metadata": metadata, "nbformat": 4, "nbformat_minor": 5}
-    target = OUT / f"{name}.ipynb"
-    target.write_text(json.dumps(notebook, indent=1))
-    print(f"  wrote {target.relative_to(ROOT)}")
-
-def remove_old_generated_notebooks():
-    """Delete every path in OUT matching one of OLD_GENERATED_PATTERNS, printing each removed path."""
-    stale = (hit for pattern in OLD_GENERATED_PATTERNS for hit in OUT.glob(pattern))
-    for hit in stale:
-        hit.unlink()
-        print(f"  removed {hit.relative_to(ROOT)}")
-
-
-# log-derived facts, computed once and baked into markdown
-def load(name):
-    """Return the parsed JSON log LOGS/<name>.json, or None when that file does not exist.
-
-    Callers rely on the None result to render fallbacks for runs that were never logged.
-    """
-    p = LOGS / f"{name}.json"
-    if not p.exists():
-        return None
-    # Close the handle deterministically instead of leaving it to the garbage
-    # collector, and read as UTF-8 rather than the platform default encoding.
-    with p.open(encoding="utf-8") as fh:
-        return json.load(fh)
-
-def best_fid(log):
-    """Return (epoch, fid) for the lowest FID in log["history"]["fid"], or (None, None) if there is none.
-
-    Epoch keys are converted to int; when several epochs share the lowest FID,
-    the earliest epoch is returned.
-    """
-    fid_by_epoch = log.get("history", {}).get("fid", {}) if log else {}
-    if not fid_by_epoch:
-        return None, None
-    pairs = [(int(epoch), value) for epoch, value in fid_by_epoch.items()]
-    pairs.sort()  # ascending epoch, so min() below resolves ties to the earliest epoch
-    return min(pairs, key=lambda pair: pair[1])
-
-def time_min(log):
-    """Return log["history"]["train_time_s"] converted to minutes, or None when it is missing or falsy."""
-    history = (log or {}).get("history", {})
-    seconds = history.get("train_time_s")
-    if not seconds:
-        return None
-    return seconds / 60
-
-def get_fid(log, epoch):
-    """Build-time helper (mirrors the runtime helper in SHARED_IMPORTS).
-
-    Return the FID stored under str(epoch) in log["history"]["fid"], or None
-    when the log, the history, or that epoch is absent.
-    """
-    history = (log or {}).get("history", {})
-    fid_by_epoch = history.get("fid", {})
-    return fid_by_epoch.get(str(epoch))
-
-
-# Shared first code cell, used by all phase notebooks. The text below is
-# emitted verbatim into each notebook and runs there, not at build time: it
-# locates generator/outputs from the notebook's working directory and defines
-# the log/FID/image helpers the later cells call.
-SHARED_IMPORTS = """\
-import json
-from pathlib import Path
-
-import matplotlib.pyplot as plt
-import matplotlib.image as mpimg
-import numpy as np
-import pandas as pd
-
-plt.rcParams.update({"figure.dpi": 120, "font.size": 10})
-
-try:
-    display
-except NameError:
-    def display(obj):
-        print(obj)
-
-def find_generator_root():
-    for base in [Path.cwd(), *Path.cwd().parents]:
-        for candidate in [base, base / "generator"]:
-            if (candidate / "outputs" / "logs").exists() and (candidate / "outputs" / "samples").exists():
-                return candidate.resolve()
-    raise FileNotFoundError("Could not locate generator/outputs from the current working directory")
-
-GENERATOR_ROOT = find_generator_root()
-PROJECT_ROOT = GENERATOR_ROOT.parent
-OUTPUTS = GENERATOR_ROOT / "outputs"
-LOGS    = OUTPUTS / "logs"
-SAMPLES = OUTPUTS / "samples"
-
-
-def load_log(name):
-    p = LOGS / f"{name}.json"
-    if not p.exists():
-        return None
-    with open(p, encoding="utf-8") as f:
-        return json.load(f)
-
-def get_fid(log, epoch):
-    fid = (log or {}).get("history", {}).get("fid", {})
-    return fid.get(str(epoch))
-
-def fid_series(log):
-    fid = log.get("history", {}).get("fid", {})
-    items = sorted((int(k), v) for k, v in fid.items())
-    return [e for e, _ in items], [v for _, v in items]
-
-def show_image_or_missing(ax, path, title=None):
-    if path.exists():
-        ax.imshow(mpimg.imread(str(path)))
-    else:
-        ax.text(0.5, 0.5, f"missing artifact\\n{path.name}", ha="center", va="center", transform=ax.transAxes)
-    if title:
-        ax.set_title(title, fontsize=9)
-    ax.axis("off")
-
-"""
-
-
-# PHASE 0 - Baseline sanity check
-def build_phase0():
-    """Assemble and write the Phase 0 notebook (NOTEBOOK_SEQUENCE["phase0"]).
-
-    The four p0_* logs are read at build time only to fill the "Logged epochs"
-    column of the intro table. Everything else is static markdown plus code
-    cells that load logs and sample PNGs when the notebook itself is run.
-    """
-    # load() returns None for a missing log; the table below renders 'n/a' for those.
-    p0 = {n: load(n) for n in ["p0_wgan", "p0_vae", "p0_ddpm", "p0_ddpm_small"]}
-    cells = [
-        md(f"""\
-# 01 - Baseline Sanity Check
-
-Phase 0 is the starting point of the generator story. It uses the raw, un-aligned
-images and very plain versions of each model family so we can confirm that the
-training code runs end-to-end before making any stronger claims.
-
-The goal is not to choose a final model here. The goal is to expose the first
-failure modes: rough WGAN blobs, blurry VAE averages, and noisy DDPM textures.
-Those failures motivate the pipeline work in Phase 1.
-
-## What this phase changes
-
-Nothing is optimized yet. This phase keeps the input pipeline rough and uses the
-minimal available recipes:
-
-| Run | Family | Logged epochs | Purpose |
-|---|---|---:|---|
-| `p0_wgan` | WGAN-GP | {len(p0['p0_wgan']['history']['g_loss']) if p0['p0_wgan'] else 'n/a'} | Basic generator/critic sanity check |
-| `p0_vae` | VAE | {len(p0['p0_vae']['history']['loss']) if p0['p0_vae'] else 'n/a'} | MSE + KL reconstruction baseline |
-| `p0_ddpm` | DDPM | {len(p0['p0_ddpm']['history']['loss']) if p0['p0_ddpm'] else 'n/a'} | Linear schedule, epsilon-prediction baseline |
-| `p0_ddpm_small` | DDPM small | {len(p0['p0_ddpm_small']['history']['loss']) if p0['p0_ddpm_small'] else 'n/a'} | Reduced-capacity sanity variant |
-
-FID was not logged in Phase 0. The evidence here is loss behavior plus saved
-sample grids.
-"""),
-        code(SHARED_IMPORTS),
-        md("## 1. Training loss curves\n\nThese curves check that the loops ran and produced stable logs. They are not enough to prove visual quality, but they are needed before interpreting samples: a broken optimization loop would make every later visual comparison meaningless.\n\n**What to look for:** the curves should move smoothly enough to show that each family is learning something. The limitation is that loss scale differs by family, so the curves compare stability, not final image quality."),
-        # NOTE(review): this cell drops missing logs from `runs` and then indexes
-        # runs["p0_wgan"], runs["p0_vae"] and runs["p0_ddpm"] unconditionally, so a
-        # missing log raises KeyError at notebook run time even though the
-        # build-time table above tolerates it. Only p0_ddpm_small is guarded.
-        # NOTE(review): the in-cell comment says "g_loss + w_dist" but the cell
-        # plots g_loss and c_loss.
-        code("""\
-runs = {n: load_log(n) for n in ["p0_wgan", "p0_vae", "p0_ddpm", "p0_ddpm_small"]}
-runs = {k: v for k, v in runs.items() if v}
-
-fig, axes = plt.subplots(1, 3, figsize=(16, 4))
-
-# WGAN: g_loss + w_dist
-h = runs["p0_wgan"]["history"]
-ep = range(1, len(h["g_loss"]) + 1)
-axes[0].plot(ep, h["g_loss"], label="G loss", color="#5B8DB8")
-axes[0].plot(ep, h["c_loss"], label="C loss", color="#E8705A", alpha=0.7)
-axes[0].set_title("p0_wgan - generator vs critic loss")
-axes[0].set_xlabel("Epoch"); axes[0].legend()
-
-# VAE: total loss + components
-h = runs["p0_vae"]["history"]
-ep = range(1, len(h["loss"]) + 1)
-axes[1].plot(ep, h["recon_loss"], label="Recon (MSE)", color="#5B8DB8")
-axes[1].plot(ep, h["kl_loss"],    label="KL",          color="#E8705A")
-axes[1].set_title("p0_vae - recon vs KL")
-axes[1].set_xlabel("Epoch"); axes[1].legend()
-
-# DDPM: noise prediction MSE
-h = runs["p0_ddpm"]["history"]
-ep = range(1, len(h["loss"]) + 1)
-axes[2].plot(ep, h["loss"], color="#5B8DB8", label="epsilon-MSE")
-if "p0_ddpm_small" in runs:
-    h2 = runs["p0_ddpm_small"]["history"]
-    axes[2].plot(range(1, len(h2["loss"]) + 1), h2["loss"], color="#E8705A", linestyle="--", label="small variant")
-axes[2].set_title("p0_ddpm - noise prediction loss")
-axes[2].set_xlabel("Epoch"); axes[2].legend()
-
-plt.tight_layout(); plt.show()
-"""),
-        md("## 2. Final sample grids\n\nThe final previews show the practical failure mode of the raw pipeline: the samples have some face-like structure, but identity, alignment, and detail are not under control. These PNGs are displayed exactly as saved, so older Phase 0 matrices keep their original layout instead of being forced into 4x4.\n\n**Why this matters:** this is the visual evidence that the first bottleneck is not only the model family. The data still contains too much pose, scale, and background variation for tiny baseline recipes."),
-        # The final-epoch numbers in this cell are hard-coded, not derived from the logs.
-        code("""\
-last_epochs = {"p0_wgan": 200, "p0_vae": 100, "p0_ddpm": 200, "p0_ddpm_small": 100}
-
-fig, axes = plt.subplots(1, 4, figsize=(16, 4))
-for ax, (name, ep) in zip(axes, last_epochs.items()):
-    img_path = SAMPLES / name / f"epoch_{ep:04d}.png"
-    if img_path.exists():
-        ax.imshow(mpimg.imread(str(img_path)))
-        ax.set_title(f"{name}\\n(epoch {ep})", fontsize=10)
-    else:
-        ax.set_title(f"{name}\\n(missing)", fontsize=10)
-    ax.axis("off")
-plt.tight_layout(); plt.show()
-"""),
-        md("## 3. Progression - early vs late\n\nThe progression grids make the baseline failure visible over time. Later samples improve slightly, but the raw input distribution keeps the task too broad. The saved matrices are shown in their original layout.\n\n**How to read it:** if more epochs only turn noise into rough face-like blobs, the next decision should be pipeline cleanup rather than simply training the same recipe longer."),
-        # Checkpoint epochs are hard-coded per run; p0_ddpm_small has no progression row.
-        code("""\
-checkpoints = {
-    "p0_wgan":       [50, 100, 200],
-    "p0_vae":        [25, 50, 100],
-    "p0_ddpm":       [50, 100, 200],
-}
-
-for name, eps in checkpoints.items():
-    fig, axes = plt.subplots(1, len(eps), figsize=(12, 4))
-    for ax, e in zip(axes, eps):
-        p = SAMPLES / name / f"epoch_{e:04d}.png"
-        if p.exists():
-            ax.imshow(mpimg.imread(str(p))); ax.set_title(f"epoch {e}", fontsize=9)
-        else:
-            ax.text(0.5, 0.5, f"epoch {e}\\n(missing)", ha="center", va="center", transform=ax.transAxes)
-        ax.axis("off")
-    fig.suptitle(name, fontsize=11, fontweight="bold")
-    plt.tight_layout(); plt.show()
-"""),
-        md("""\
-## 4. What this phase proves
-
-Phase 0 proves that the code path works, but it also proves that raw images are
-too noisy a starting point for the rest of the project. The WGAN produces rough
-color blobs, the VAE averages faces into blur, and the DDPM is the most textured
-but still noisy.
-
-**Decision:** treat data quality as the first bottleneck. Phase 1 therefore
-locks the pipeline before the project spends more compute on stronger recipes.
-
-**Report conclusion:** Phase 0 is a sanity check, not a competitive result. It
-establishes the baseline failure and motivates the move to aligned face crops.
-"""),
-    ]
-    write_nb(NOTEBOOK_SEQUENCE["phase0"], cells)
-
-
-# PHASE 1 - Pipeline ablations with a DCGAN proxy
-def build_phase1():
-    """Assemble and write the Phase 1 notebook (NOTEBOOK_SEQUENCE["phase1"]).
-
-    The seven p1* DCGAN-proxy logs are read at build time to pick the headline
-    run and to fill the FID@50 figures of the closing summary table; the plots
-    and tables in between are produced by code cells when the notebook is run.
-    """
-    runs = {n: load(n) for n in ["p1a_dcgan_64", "p1a_dcgan_128", "p1b_dcgan_full",
-                                  "p1b_dcgan_aligned", "p1c_dcgan_hflip",
-                                  "p1c_dcgan_full_aug", "p1d_dcgan_combined"]}
-    # Lowest best-FID wins; runs with no FID (or no log at all) yield None and
-    # are pushed to the end by the 9e9 sentinel.
-    # NOTE(review): best_fid() takes the minimum over every logged epoch, yet the
-    # markdown below labels the value "FID@50" - confirm the minimum is always at
-    # epoch 50. If every log is missing, best_val is None and `{best_val:.1f}`
-    # raises TypeError.
-    best_run = min(runs.items(), key=lambda kv: best_fid(kv[1])[1] or 9e9)
-    best_name, best_log = best_run
-    _, best_val = best_fid(best_log)
-    p0_gan = load("p0_wgan")
-
-    cells = [
-        md(f"""\
-# 02 - Pipeline Selection
-
-Phase 1 answers the data-handling question left open by the baseline. Instead
-of changing the model family, it uses a cheap DCGAN proxy and varies one
-pipeline choice at a time.
-
-This phase is deliberately controlled. The output quality is still limited, but
-the relative differences tell us which input pipeline gives later recipes the
-best chance.
-
-## What this phase changes
-
-Four pipeline choices are tested as ablations:
-
-| Ablation | Question | Choices |
-|---|---|---|
-| 1A | How much resolution can the proxy handle? | 64x64 vs 128x128 |
-| 1B | Does alignment matter? | Full raw image vs MTCNN-aligned crop |
-| 1C | Does augmentation help the proxy? | H-flip only vs H-flip + rotation + color jitter |
-| 1D | Should raw and aligned images be mixed? | Aligned only vs aligned + raw mixed |
-
-**Headline result:** `{best_name}` reaches **FID@50 = {best_val:.1f}**. The
-locked pipeline for the following phases is aligned face crops at 64x64, no
-raw/aligned mixing, with augmentation choices following the saved family configs.
-"""),
-        # Epoch count comes from the p0_wgan log when present, otherwise the literal 200.
-        md(f"""\
-### Reference: Phase 0 baseline from the same family
-
-The phase 0 WGAN-GP (`p0_wgan`) trained on raw un-aligned images for {len(p0_gan['history']['g_loss']) if p0_gan else 200} epochs
-without any pipeline tuning, and it also collapsed. Phase 1 below uses the same model class
-with the data pipeline systematically varied; the architecture limitation is constant.
-"""),
-        code(SHARED_IMPORTS),
-        md("## 1. Load all experiment logs\n\nAll evidence in this notebook comes from the existing Phase 1 logs and sample folders. The cell is intentionally simple: it only inventories already saved experiments so the reader knows which pipeline ablations are being compared."),
-        # At run time the notebook discovers logs by globbing p1*.json, whereas the
-        # build-time `runs` above uses a fixed list of seven names.
-        code("""\
-run_names = sorted(p.stem for p in LOGS.glob("p1*.json"))
-runs = {name: load_log(name) for name in run_names}
-runs = {k: v for k, v in runs.items() if v}
-
-print(f"Loaded {len(runs)} experiments:")
-for name in run_names: print(f"  {name}")
-"""),
-        code("""\
-experiment_groups = {
-    "1A - Resolution":       {"p1a_dcgan_64":       "64x64 (raw)",
-                              "p1a_dcgan_128":      "128x128 (raw)"},
-    "1B - Alignment":        {"p1b_dcgan_full":     "Full image (raw)",
-                              "p1b_dcgan_aligned":  "MTCNN-aligned"},
-    "1C - Augmentation":     {"p1c_dcgan_hflip":    "H-flip only",
-                              "p1c_dcgan_full_aug": "H-flip + rot + colour"},
-    "1D - Dataset mixing":   {"p1b_dcgan_aligned":  "Aligned only",
-                              "p1d_dcgan_combined": "Aligned + raw mixed"},
-}
-"""),
-        md("## 2. FID comparison table\n\nThe table ranks the proxy runs. It is needed because the visual samples alone can be misleading: a run can look slightly better in one grid while still being worse across the saved distribution. The values are useful within Phase 1, but they should not be compared directly with later FID protocols."),
-        # NOTE(review): the "(ep50)" columns take the last logged loss value
-        # ([-1]); that is epoch 50 only if each run logged exactly 50 epochs.
-        code("""\
-rows = []
-for name in run_names:
-    r = runs[name]; cfg = r["config"]
-    rows.append({
-        "Experiment":   name,
-        "Size":         f"{cfg.get('image_size')}x{cfg.get('image_size')}",
-        "Augment":      cfg.get("augment", False),
-        "FID@25":       get_fid(r, 25),
-        "FID@50":       get_fid(r, 50),
-        "G loss (ep50)": r["history"]["g_loss"][-1],
-        "D loss (ep50)": r["history"]["d_loss"][-1],
-    })
-df = pd.DataFrame(rows).sort_values("FID@50")
-df.style.format({"FID@25": "{:.1f}", "FID@50": "{:.1f}",
-                 "G loss (ep50)": "{:.3f}", "D loss (ep50)": "{:.3f}"})
-"""),
-        code("""\
-fig, ax = plt.subplots(figsize=(10, 5))
-labels = df["Experiment"].values
-fid25  = df["FID@25"].values
-fid50  = df["FID@50"].values
-x = np.arange(len(labels)); w = 0.35
-
-ax.bar(x - w/2, fid25, w, label="FID @ 25", color="#5B8DB8", alpha=0.85)
-ax.bar(x + w/2, fid50, w, label="FID @ 50", color="#E8705A", alpha=0.85)
-ax.set_ylabel("FID (lower is better)")
-ax.set_title("Phase 1 - FID across all pipeline ablations")
-ax.set_xticks(x); ax.set_xticklabels(labels, rotation=30, ha="right")
-ax.legend(); plt.tight_layout(); plt.show()
-"""),
-        md("## 3. Controlled ablation results\n\nEach subplot holds the model approximately fixed and changes one pipeline factor. This is the decision evidence for the rest of the generator suite: alignment, resolution, augmentation, and dataset mixing are treated as pipeline choices, not as disconnected experiments.\n\n**What to look for:** a useful pipeline change should lower FID consistently inside its ablation group, not only produce one nicer-looking example."),
-        code("""\
-fig, axes = plt.subplots(2, 2, figsize=(14, 10))
-axes = axes.flatten()
-colors = ["#5B8DB8", "#E8705A"]
-
-for idx, (group_title, experiments) in enumerate(experiment_groups.items()):
-    ax = axes[idx]
-    for i, (run_name, label) in enumerate(experiments.items()):
-        epochs, fid_vals = fid_series(runs[run_name])
-        f50 = get_fid(runs[run_name], 50)
-        ax.plot(epochs, fid_vals, "o-",
-                label=f"{label} (FID@50={f50:.1f})",
-                color=colors[i], linewidth=2, markersize=8)
-    ax.set_xlabel("Epoch"); ax.set_ylabel("FID")
-    ax.set_title(group_title); ax.legend(fontsize=9)
-    ax.set_xlim(20, 55)
-fig.suptitle("FID per ablation group", fontsize=14, fontweight="bold", y=1.01)
-plt.tight_layout(); plt.show()
-"""),
-        md("""\
-## 4. Data pipeline visualization
-
-What each ablation actually changes, shown on the input data the model sees.
-These figures are not model outputs. They explain the input distribution that
-each model has to learn, which is why they sit next to the ablation results.
-"""),
-        # Helper cell for section 4: samples raw/aligned image paths (seeded with
-        # random.seed(0)) and defines the show()/show_unavailable() drawing helpers.
-        code("""\
-import random
-from PIL import Image
-import torchvision.transforms as T
-
-random.seed(0)
-RAW     = PROJECT_ROOT / "data" / "wiki"
-ALIGNED = PROJECT_ROOT / "cropped" / "generator" / "wiki"
-
-def sample_paths(root, k=4):
-    if not root.exists():
-        print(f"Missing image directory: {root}")
-        return []
-    shards = [d for d in root.iterdir() if d.is_dir()]
-    files  = []
-    for s in random.sample(shards, min(8, len(shards))):
-        files += list(s.glob("*.jpg"))[:50]
-    return random.sample(files, min(k, len(files)))
-
-def matched_pairs(k=4):
-    if not RAW.exists() or not ALIGNED.exists():
-        print(f"Missing raw/aligned image directory: RAW={RAW.exists()} ALIGNED={ALIGNED.exists()}")
-        return []
-    shards = sorted(d.name for d in ALIGNED.iterdir() if d.is_dir() and (RAW / d.name).is_dir())
-    pairs  = []
-    for shard in random.sample(shards, min(8, len(shards))):
-        for ali in (ALIGNED / shard).glob("*.jpg"):
-            raw = RAW / shard / ali.name
-            if raw.exists():
-                pairs.append((raw, ali))
-            if len(pairs) >= 50: break
-        if len(pairs) >= 50: break
-    return random.sample(pairs, min(k, len(pairs)))
-
-def show(ax, img, title=None):
-    ax.imshow(img); ax.axis("off")
-    if title: ax.set_title(title, fontsize=9)
-
-def show_unavailable(ax, message):
-    ax.text(0.5, 0.5, message, ha="center", va="center", wrap=True, transform=ax.transAxes)
-    ax.axis("off")
-"""),
-        md("### 4A - Resolution\n\nSame raw image at 64x64 and 128x128. This is a paired comparison layout, so it keeps the original 2x4 format instead of being forced into a 4x4 sample grid.\n\n**Interpretation:** 128x128 carries more detail, but it also makes the proxy generator learn a harder distribution. The later decision favors 64x64 because stable face structure matters more than nominal resolution at this budget."),
-        code("""\
-paths = sample_paths(RAW, k=4)
-fig, axes = plt.subplots(2, 4, figsize=(12, 6))
-if not paths:
-    for ax in axes.ravel():
-        show_unavailable(ax, "raw images unavailable")
-else:
-    for col, p in enumerate(paths):
-        img = Image.open(p).convert("RGB")
-        show(axes[0][col], T.CenterCrop(min(img.size))(img).resize((64, 64)),  "64x64")
-        show(axes[1][col], T.CenterCrop(min(img.size))(img).resize((128, 128)), "128x128")
-fig.suptitle("1A - Resolution: same image at two scales", fontsize=12, fontweight="bold")
-plt.tight_layout(); plt.show()
-"""),
-        md("### 4B - Alignment\n\nRaw vs MTCNN-aligned 64x64 crops. This paired layout keeps the original 2x4 format so each raw image is directly above its aligned crop.\n\n**Interpretation:** alignment removes background and scale variation before the generator spends capacity on it. This is why alignment becomes the strongest pipeline lever."),
-        code("""\
-pairs = matched_pairs(k=4)
-fig, axes = plt.subplots(2, 4, figsize=(12, 6))
-if not pairs:
-    for ax in axes.ravel():
-        show_unavailable(ax, "matched raw/aligned crops unavailable")
-else:
-    for col, (raw_p, ali_p) in enumerate(pairs):
-        raw_img = Image.open(raw_p).convert("RGB")
-        show(axes[0][col], T.CenterCrop(min(raw_img.size))(raw_img).resize((128, 128)), "raw")
-        show(axes[1][col], Image.open(ali_p).convert("RGB"), "MTCNN-aligned")
-fig.suptitle("1B - Alignment: same source image, raw vs MTCNN-aligned", fontsize=12, fontweight="bold")
-plt.tight_layout(); plt.show()
-"""),
-        md("### 4C - Augmentation\n\nOne aligned image, then deterministic examples of the saved augmentation idea. This keeps the original compact strip because the point is to compare transforms on one image, not to make a generated 4x4 sample matrix.\n\n**Interpretation:** augmentation can make the training distribution broader, but it can also blur already scarce structure. Phase 1 treats it as a pipeline setting to justify, not as an automatic improvement."),
-        code("""\
-src = sample_paths(ALIGNED, k=1)
-if src:
-    img = Image.open(src[0]).convert("RGB").resize((128, 128))
-    none = T.Compose([])
-    hflip = T.Compose([T.RandomHorizontalFlip(p=1.0)])
-    full  = T.Compose([
-        T.RandomHorizontalFlip(p=1.0),
-        T.RandomRotation(degrees=5, interpolation=T.InterpolationMode.BILINEAR),
-        T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.05),
-    ])
-    fig, axes = plt.subplots(1, 6, figsize=(15, 3))
-    show(axes[0], none(img),  "original")
-    show(axes[1], hflip(img), "hflip")
-    for i, ax in enumerate(axes[2:]):
-        show(ax, full(img), f"full aug #{i+1}")
-    fig.suptitle("1C - Augmentation: original vs hflip vs full augmentation", fontsize=12, fontweight="bold")
-    plt.tight_layout(); plt.show()
-else:
-    fig, ax = plt.subplots(figsize=(6, 2))
-    show_unavailable(ax, "aligned crop directory unavailable")
-    fig.suptitle("1C - Augmentation", fontsize=12, fontweight="bold")
-    plt.tight_layout(); plt.show()
-"""),
-        md("### 4D - Dataset mixing\n\nMixing raw and aligned images asks one generator to model two different input distributions. This keeps the original paired 2x4 layout so the contrast is easy to read.\n\n**Interpretation:** mixing increases nuisance variation and makes the generator solve two problems at once. The later phases therefore inherit aligned-only data."),
-        code("""\
-pairs = matched_pairs(k=4)
-fig, axes = plt.subplots(2, 4, figsize=(12, 6))
-if not pairs:
-    for ax in axes.ravel():
-        show_unavailable(ax, "matched raw/aligned crops unavailable")
-else:
-    for col, (raw_p, ali_p) in enumerate(pairs):
-        raw_img = Image.open(raw_p).convert("RGB")
-        show(axes[0][col], T.CenterCrop(min(raw_img.size))(raw_img).resize((128, 128)), "raw (mixed in)")
-        show(axes[1][col], Image.open(ali_p).convert("RGB"), "aligned")
-fig.suptitle("1D - Mixing: same source image, raw vs aligned", fontsize=12, fontweight="bold")
-plt.tight_layout(); plt.show()
-"""),
-        # NOTE(review): each `{get_fid(...):.1f}` below raises TypeError at build
-        # time when the run's log is missing or has no FID at epoch 50 (get_fid
-        # returns None); there is no 'n/a' fallback here. The "Winner" column is
-        # hard-coded text, not derived from the logged values.
-        md(f"""\
-## 5. What this phase proves
-
-Lowest FID of Phase 1: **`{best_name}` at FID@50 = {best_val:.1f}**. These
-numbers rank pipeline choices only; the proxy generator is still not the final
-quality target.
-
-| Ablation | Winner | Winner FID@50 | Comparison | Interpretation |
-|---|---|---:|---|---|
-| 1A - Resolution | 64x64 | {get_fid(runs['p1a_dcgan_64'], 50):.1f} | 128x128: {get_fid(runs['p1a_dcgan_128'], 50):.1f} | 128x128 is too hard for this proxy budget |
-| 1B - Alignment | MTCNN-aligned | {get_fid(runs['p1b_dcgan_aligned'], 50):.1f} | full image: {get_fid(runs['p1b_dcgan_full'], 50):.1f} | alignment is the strongest lever |
-| 1C - Augmentation | H-flip + rotation + color | {get_fid(runs['p1c_dcgan_full_aug'], 50):.1f} | H-flip: {get_fid(runs['p1c_dcgan_hflip'], 50):.1f} | richer augmentation helps this proxy |
-| 1D - Dataset mixing | aligned only | {get_fid(runs['p1b_dcgan_aligned'], 50):.1f} | mixed: {get_fid(runs['p1d_dcgan_combined'], 50):.1f} | raw+aligned mixing increases distribution variance |
-
-**Decision:** carry forward MTCNN-aligned crops, 64x64 images, and aligned-only
-data. The saved later configs keep this pipeline and choose augmentation per
-model family.
-
-**Report conclusion:** Phase 1 turns the Phase 0 failure into a pipeline
-decision. Alignment is the main fix; Phase 2 can now focus on the GAN recipe
-instead of fighting raw-image variance.
-"""),
-    ]
-    write_nb(NOTEBOOK_SEQUENCE["phase1"], cells)
-
-
-# PHASE 2 - GAN architecture and objective evolution
-def build_phase2():
-    runs = {n: load(n) for n in ["p2_1_dcgan", "p2_2_wgan", "p2_3_wgan_sn_attn", "p2_4_wgan_sn_attn_128"]}
-    best_name = min(runs, key=lambda n: best_fid(runs[n])[1] or 9e9)
-    _, best_val = best_fid(runs[best_name])
-    p0_gan = load("p0_wgan")
-
-    cells = [
-        md(f"""\
-# 03 - GAN Stability Progression
-
-Phase 2 keeps the Phase 1 pipeline fixed and changes the GAN recipe. This makes
-the question narrow: once the data is aligned, what model changes are needed to
-escape collapse?
-
-The progression moves from the DCGAN proxy to Wasserstein training, then to the
-stability package that finally makes the samples recognizable.
-
-## What this phase changes
-
-| Run | Recipe change |
-|---|---|
-| `p2_1_dcgan` | DCGAN baseline under the Phase 2 protocol |
-| `p2_2_wgan` | BCE objective replaced by Wasserstein-GP |
-| `p2_3_wgan_sn_attn` | Spectral norm + GroupNorm + self-attention |
-| `p2_4_wgan_sn_attn_128` | Same stabilized recipe at 128x128 |
-
-**Headline result:** `{best_name}` reaches **best FID = {best_val:.1f}**. The
-critical step is not the objective change alone; it is the stabilized 64x64
-recipe with spectral normalization, GroupNorm, and self-attention.
-"""),
-        md("""\
-> ### FID is not comparable across phases
->
-> Phase 1's "best" was FID 33 (`p1c_dcgan_full_aug`). Phase 2's "best" is FID 110.
-> **This is not a regression.** The two numbers were computed under different
-> protocols:
->
-> - Phase 1 used a quick proxy FID for fast pipeline ablation, with a smaller
->   real-image reference set, on the un-augmented validation split.
-> - Phase 2 uses the project's standard FID protocol: 5000 aligned 64x64 real
->   images from the matched augmentation pipeline (`fid_n_real: 5000`).
->
-> Within Phase 2 the deltas are meaningful: 2.2 -> 2.3 is about **-311 FID**,
-> which is a real architecture jump. Don't compare phase 1 vs phase 2 numbers absolutely;
-> only compare within a phase, or against phase 5 which uses the same protocol.
-"""),
-        md(f"""\
-### Reference: Phase 0 baseline from the same family
-
-`p0_wgan` was the un-aligned, no-augmentation, basic-architecture WGAN-GP: face blobs
-with no recognisable features (no FID logged). Phase 2 below shows what happens once
-the pipeline is fixed and the model is allowed to evolve.
-"""),
-        code(SHARED_IMPORTS),
-        md("## 1. Load experiment logs\n\nOnly existing Phase 2 logs are loaded here. No training or re-evaluation is launched."),
-        code("""\
-run_names = ["p2_1_dcgan", "p2_2_wgan", "p2_3_wgan_sn_attn", "p2_4_wgan_sn_attn_128"]
-run_labels = {
-    "p2_1_dcgan":            "2.1 DCGAN (BCE)",
-    "p2_2_wgan":             "2.2 WGAN-GP",
-    "p2_3_wgan_sn_attn":     "2.3 + SN + Attn",
-    "p2_4_wgan_sn_attn_128": "2.4 + 128x128",
-}
-visual_notes = {
-    "p2_1_dcgan": "collapsed gray output",
-    "p2_2_wgan": "collapsed gray output",
-    "p2_3_wgan_sn_attn": "recognizable faces",
-    "p2_4_wgan_sn_attn_128": "under-trained 128x128",
-}
-runs = {name: load_log(name) for name in run_names}
-runs = {k: v for k, v in runs.items() if v}
-for n in run_names:
-    if n in runs: print(f"  {n}: {len(runs[n]['history']['g_loss'])} epochs")
-    else: print(f"  {n}: MISSING")
-"""),
-        md("## 2. FID comparison table\n\nThis table is the quantitative spine of the GAN progression: lower FID means generated samples are closer to the saved real reference distribution."),
-        code("""\
-rows = []
-for name in run_names:
-    if name not in runs: continue
-    r = runs[name]
-    epochs, fid_vals = fid_series(r)
-    best = min(fid_vals) if fid_vals else None
-    rows.append({
-        "Run":    run_labels[name],
-        "FID@25": get_fid(r, 25),
-        "FID@50": get_fid(r, 50),
-        "FID@100": get_fid(r, 100),
-        "Best FID": best,
-        "Train (min)": (r['history'].get('train_time_s') or 0) / 60,
-    })
-df = pd.DataFrame(rows).sort_values("Best FID")
-df.style.format({"FID@25": "{:.1f}", "FID@50": "{:.1f}", "FID@100": "{:.1f}",
-                 "Best FID": "{:.1f}", "Train (min)": "{:.1f}"})
-"""),
-        md("## 3. FID curves - progression\n\nThis plot shows whether improvements happen gradually or as a step change. It is needed because the final FID table hides training dynamics: here the key story is that the 2.3 stability package changes the whole trajectory, while 2.1 and 2.2 remain collapsed."),
-        code("""\
-fig, ax = plt.subplots(figsize=(10, 5))
-cmap = plt.cm.viridis
-for i, name in enumerate(run_names):
-    if name not in runs: continue
-    epochs, fid_vals = fid_series(runs[name])
-    ax.plot(epochs, fid_vals, "o-", label=run_labels[name],
-            color=cmap(i / len(run_names)), linewidth=2, markersize=7)
-ax.set_xlabel("Epoch"); ax.set_ylabel("FID")
-ax.set_title("Phase 2 - FID curves")
-ax.legend(); plt.tight_layout(); plt.show()
-"""),
-        md("## 4. Training dynamics\n\nThe loss curves help explain why the visual jump happens. The objective alone is unstable; the normalized, attention-equipped recipe is where training becomes useful."),
-        code("""\
-fig, axes = plt.subplots(1, 2, figsize=(14, 5))
-cmap = plt.cm.viridis
-for i, name in enumerate(run_names):
-    if name not in runs: continue
-    h = runs[name]["history"]
-    color = cmap(i / len(run_names))
-    epochs = range(1, len(h["g_loss"]) + 1)
-    axes[0].plot(epochs, h["g_loss"], color=color, label=run_labels[name], linewidth=1.2)
-    if "w_dist" in h:
-        axes[1].plot(epochs, h["w_dist"], color=color, label=run_labels[name], linewidth=1.2)
-    elif "d_loss" in h:
-        axes[1].plot(epochs, h["d_loss"], color=color, label=run_labels[name], linewidth=1.2, linestyle="--")
-axes[0].set_title("Generator loss"); axes[0].set_xlabel("Epoch"); axes[0].legend(fontsize=8)
-axes[1].set_title("Wasserstein distance / D loss"); axes[1].set_xlabel("Epoch"); axes[1].legend(fontsize=8)
-plt.tight_layout(); plt.show()
-"""),
-        md("""\
-## 5. Sample grids - epoch 100
-
-- **2.1 and 2.2 collapsed** - the gray cells are the actual saved generator outputs,
-  not missing images. The black lines are just grid separators. Vanilla DCGAN/WGAN-GP
-  at this scale is still too weak, and their FIDs (>400) confirm the failure.
-- **2.3 is the breakthrough** - spectral norm + GroupNorm + self-attention escape
-  mode collapse and produce diverse, recognisable faces.
-- **2.4 (128x128) regresses** - same architecture at higher resolution at fixed
-  compute under-trains.
-"""),
-        code("""\
-fig, axes = plt.subplots(1, 4, figsize=(16, 4.5))
-for ax, name in zip(axes, run_names):
-    img_path = SAMPLES / name / "epoch_0100.png"
-    f100 = get_fid(runs.get(name, {}), 100) if name in runs else None
-    title = f"{run_labels[name]}\\nFID@100={f100:.1f}" if f100 else run_labels[name]
-    title += f"\\n{visual_notes.get(name, '')}"
-    show_image_or_missing(ax, img_path, title)
-plt.tight_layout(); plt.show()
-"""),
-        md("## 6. Progression - epoch 10 -> 50 -> 100\n\nThese panels connect time to visual quality. For the collapsed runs, the gray grids are still information: they show that more epochs did not fix the recipe. For the stabilized run, the same timeline shows recognizable faces emerging."),
-        code("""\
-check_epochs = [10, 50, 100]
-for name in run_names:
-    if name not in runs: continue
-    fig, axes = plt.subplots(1, len(check_epochs), figsize=(12, 4))
-    for ax, e in zip(axes, check_epochs):
-        p = SAMPLES / name / f"epoch_{e:04d}.png"
-        f = get_fid(runs[name], e)
-        title = f"epoch {e}" + (f"\\nFID={f:.1f}" if f else "")
-        if name in {"p2_1_dcgan", "p2_2_wgan"}:
-            title += "\\ncollapsed output"
-        show_image_or_missing(ax, p, title)
-    fig.suptitle(run_labels[name], fontsize=11, fontweight="bold")
-    plt.tight_layout(); plt.show()
-"""),
-        md("## 7. Pairwise comparison - what each step bought us\n\nEach pair isolates one decision. The purpose is to avoid saying simply that the final GAN is better: the comparison shows that Wasserstein loss alone is insufficient, the stability package is decisive, and 128x128 is premature under the saved compute budget."),
-        code("""\
-transitions = [
-    ("2.1 -> 2.2: BCE -> Wasserstein",      "p2_1_dcgan",         "p2_2_wgan"),
-    ("2.2 -> 2.3: + SN + GroupNorm + Attn", "p2_2_wgan",          "p2_3_wgan_sn_attn"),
-    ("2.3 -> 2.4: 64 -> 128 resolution",    "p2_3_wgan_sn_attn",  "p2_4_wgan_sn_attn_128"),
-]
-for title, a, b in transitions:
-    if a not in runs or b not in runs: continue
-    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
-    for ax, name in zip(axes, [a, b]):
-        img_path = SAMPLES / name / "epoch_0100.png"
-        f = get_fid(runs[name], 100)
-        title_text = f"{run_labels[name]}\\nFID@100={f:.1f}" if f else run_labels[name]
-        title_text += f"\\n{visual_notes.get(name, '')}"
-        show_image_or_missing(ax, img_path, title_text)
-    fig.suptitle(title, fontsize=12, fontweight="bold")
-    plt.tight_layout(); plt.show()
-"""),
-        md(f"""\
-## 8. What this phase proves
-
-| Step | Run | Best FID | Delta vs previous |
-|---|---|---:|---:|
-| 2.1 DCGAN baseline | `p2_1_dcgan` | {best_fid(runs['p2_1_dcgan'])[1]:.1f} | n/a |
-| 2.2 + Wasserstein-GP | `p2_2_wgan` | {best_fid(runs['p2_2_wgan'])[1]:.1f} | {best_fid(runs['p2_2_wgan'])[1] - best_fid(runs['p2_1_dcgan'])[1]:+.1f} |
-| 2.3 + SN + GroupNorm + attention | `p2_3_wgan_sn_attn` | {best_fid(runs['p2_3_wgan_sn_attn'])[1]:.1f} | {best_fid(runs['p2_3_wgan_sn_attn'])[1] - best_fid(runs['p2_2_wgan'])[1]:+.1f} |
-| 2.4 + 128x128 | `p2_4_wgan_sn_attn_128` | {best_fid(runs['p2_4_wgan_sn_attn_128'])[1]:.1f} | {best_fid(runs['p2_4_wgan_sn_attn_128'])[1] - best_fid(runs['p2_3_wgan_sn_attn'])[1]:+.1f} |
-
-Changing the loss from BCE to Wasserstein-GP is not enough by itself. The
-breakthrough is the combined stability recipe in 2.3: spectral normalization,
-GroupNorm, and self-attention. The 128x128 run then regresses under the saved
-compute budget.
-
-**Decision:** select the 64x64 WGAN-GP recipe with spectral normalization,
-GroupNorm, and self-attention as the GAN representative for the final
-comparison.
-
-**Report conclusion:** Phase 2 turns the GAN from a collapsing proxy into a
-usable generator recipe, but it also shows that higher resolution is not helpful
-without enough training budget.
-"""),
-    ]
-    write_nb(NOTEBOOK_SEQUENCE["phase2"], cells)
-
-
-# PHASE 3 - VAE composite-loss evolution (writes notebook 04_vae_loss_progression from the three saved p3_* run logs)
-def build_phase3():  # takes no arguments, returns None; its only output is the notebook file written at the end
-    runs = {n: load(n) for n in ["p3_1_vae", "p3_2_vae_perceptual", "p3_3_vae_patchgan"]}  # `load` is defined outside this view; presumably reads the run's JSON log - TODO confirm
-    best_name = min(runs, key=lambda n: best_fid(runs[n])[1] or 9e9)  # lowest best-FID wins; a falsy FID (None or 0) is ranked last via the 9e9 sentinel
-    _, best_val = best_fid(runs[best_name])  # NOTE(review): rendered with :.1f in the first cell, so a None here would raise at build time
-
-    cells = [  # md(f"...") cells are rendered now, at build time; code("...") strings are stored verbatim as notebook code cells
-        md(f"""\
-# 04 - VAE Loss Progression
-
-Phase 3 studies the VAE family after the pipeline has been locked. The baseline
-VAE is fast and stable, but its MSE + KL objective tends to average away facial
-detail.
-
-This phase asks whether extra loss terms can recover sharper, more realistic
-samples. The saved runs stack the losses one step at a time.
-
-## What this phase changes
-
-| Run | Recipe change |
-|---|---|
-| `p3_1_vae` | Baseline MSE + KL |
-| `p3_2_vae_perceptual` | Adds VGG16 perceptual loss (`lambda_perceptual=0.1`) |
-| `p3_3_vae_patchgan` | Adds PatchGAN adversarial loss (`lambda_adversarial=0.01`) |
-
-**Headline result:** `{best_name}` reaches **best FID = {best_val:.1f}** on
-prior samples. The important result is the sequence: the added losses are
-complementary, not interchangeable.
-"""),
-        md("""\
-### Reference: Phase 0 baseline from the same family
-
-`p0_vae` was MSE+KL on raw un-aligned data. Prior samples were heavily blurred
-mean-faces, the textbook VAE failure mode. Phase 3 keeps the encoder/decoder
-architecture and pipeline fixed and shows that adding perceptual + adversarial
-terms is what actually moves the needle.
-"""),
-        code(SHARED_IMPORTS),  # shared setup cell; later cells use load_log, fid_series, get_fid, SAMPLES, plt, pd, mpimg - presumably defined by it, verify
-        md("## 1. Load experiment logs\n\nThe notebook loads the existing VAE logs and saved previews only."),
-        code("""\
-run_names = ["p3_1_vae", "p3_2_vae_perceptual", "p3_3_vae_patchgan"]
-run_labels = {
-    "p3_1_vae":            "3.1 MSE + KL",
-    "p3_2_vae_perceptual": "3.2 + Perceptual",
-    "p3_3_vae_patchgan":   "3.3 + PatchGAN",
-}
-runs = {name: load_log(name) for name in run_names}
-runs = {k: v for k, v in runs.items() if v}
-for n in run_names: print(f"  {n}: {'OK' if n in runs else 'MISSING'}")
-"""),
-        md("## 2. FID comparison table (prior samples)\n\nFID is computed on samples decoded from the prior, so it evaluates generation quality rather than only reconstruction quality."),
-        code("""\
-rows = []
-for name in run_names:
-    if name not in runs: continue
-    r = runs[name]; h = r["history"]
-    _, fid_vals = fid_series(r)
-    rows.append({
-        "Run":   run_labels[name],
-        "FID@50":  get_fid(r, 50),
-        "FID@100": get_fid(r, 100),
-        "Best FID": min(fid_vals) if fid_vals else None,
-        "Recon@100": h["recon_loss"][-1] if h.get("recon_loss") else None,
-        "KL@100":    h["kl_loss"][-1]    if h.get("kl_loss")    else None,
-        "Train (min)": (h.get("train_time_s") or 0) / 60,
-    })
-df = pd.DataFrame(rows).sort_values("Best FID")
-df.style.format({"FID@50": "{:.1f}", "FID@100": "{:.1f}", "Best FID": "{:.1f}",
-                 "Recon@100": "{:.4f}", "KL@100": "{:.2f}", "Train (min)": "{:.1f}"})
-"""),
-        md("## 3. FID curves - progression\n\nThe curves show how each extra loss changes the generation trajectory, not just the final checkpoint. A useful VAE loss should improve prior-sample FID while preserving the stable behavior that makes VAEs attractive."),
-        code("""\
-fig, ax = plt.subplots(figsize=(10, 5))
-cmap = plt.cm.plasma
-for i, name in enumerate(run_names):
-    if name not in runs: continue
-    epochs, fid_vals = fid_series(runs[name])
-    ax.plot(epochs, fid_vals, "o-", label=run_labels[name],
-            color=cmap(i / len(run_names)), linewidth=2, markersize=7)
-ax.set_xlabel("Epoch"); ax.set_ylabel("FID (prior samples)")
-ax.set_title("Phase 3 - FID curves"); ax.legend()
-plt.tight_layout(); plt.show()
-"""),
-        md("## 4. Training loss components\n\nThe component losses explain the tradeoff: reconstruction and KL preserve the VAE structure, perceptual loss encourages visual detail, and the PatchGAN term pushes samples toward the face manifold."),
-        code("""\
-fig, axes = plt.subplots(1, 3, figsize=(16, 4))
-cmap = plt.cm.plasma
-for i, name in enumerate(run_names):
-    if name not in runs: continue
-    h = runs[name]["history"]; color = cmap(i / len(run_names))
-    epochs = range(1, len(h["recon_loss"]) + 1)
-    axes[0].plot(epochs, h["recon_loss"], color=color, label=run_labels[name])
-    axes[1].plot(epochs, h["kl_loss"],    color=color, label=run_labels[name])
-    if "perc_loss" in h and any(h["perc_loss"]):
-        axes[2].plot(epochs, h["perc_loss"], color=color, label=run_labels[name])
-axes[0].set_title("Reconstruction (MSE)"); axes[0].set_xlabel("Epoch"); axes[0].legend(fontsize=8)
-axes[1].set_title("KL divergence");        axes[1].set_xlabel("Epoch"); axes[1].legend(fontsize=8)
-axes[2].set_title("Perceptual (VGG16)");   axes[2].set_xlabel("Epoch"); axes[2].legend(fontsize=8)
-plt.tight_layout(); plt.show()
-"""),
-        md("## 5. Prior samples - epoch 100\n\nThese are generated samples from the latent prior, so they answer the true generator question: if we sample a random latent vector, do we get plausible faces? This is different from reconstruction quality."),
-        code("""\
-fig, axes = plt.subplots(1, 3, figsize=(13, 4.5))
-for ax, name in zip(axes, run_names):
-    img_path = SAMPLES / name / "epoch_0100.png"
-    if img_path.exists():
-        ax.imshow(mpimg.imread(str(img_path)))
-        f = get_fid(runs.get(name, {}), 100) if name in runs else None
-        ax.set_title(f"{run_labels[name]}\\nFID@100={f:.1f}" if f else run_labels[name], fontsize=10)
-    ax.axis("off")
-fig.suptitle("Prior samples (decoded from N(0, I))", fontsize=12, fontweight="bold")
-plt.tight_layout(); plt.show()
-"""),
-        md("## 6. Reconstructions - epoch 100\n\nReconstructions show whether the encoder-decoder still preserves real input structure. They are useful as a diagnostic, but they are not the final generation metric because reconstructing a known image is easier than sampling a new one."),
-        code("""\
-fig, axes = plt.subplots(1, 3, figsize=(13, 4.5))
-for ax, name in zip(axes, run_names):
-    img_path = SAMPLES / name / "epoch_0100_recon.png"
-    if img_path.exists():
-        ax.imshow(mpimg.imread(str(img_path)))
-        ax.set_title(run_labels[name], fontsize=10)
-    ax.axis("off")
-fig.suptitle("Reconstructions (real / decoded interleaved)", fontsize=12, fontweight="bold")
-plt.tight_layout(); plt.show()
-"""),
-        md("## 7. Progression - epoch 10 -> 50 -> 100 (prior samples)\n\nThe timeline shows how the sampled faces change as the loss stack trains. The limitation remains visible: the VAE becomes more structured and detailed, but it still tends toward smoother faces than GAN or DDPM samples."),
-        code("""\
-check_epochs = [10, 50, 100]
-for name in run_names:
-    if name not in runs: continue
-    fig, axes = plt.subplots(1, len(check_epochs), figsize=(12, 4))
-    for ax, e in zip(axes, check_epochs):
-        p = SAMPLES / name / f"epoch_{e:04d}.png"
-        if p.exists():
-            ax.imshow(mpimg.imread(str(p)))
-            f = get_fid(runs[name], e)
-            ax.set_title(f"epoch {e}" + (f"\\nFID={f:.1f}" if f else ""), fontsize=9)
-        ax.axis("off")
-    fig.suptitle(run_labels[name], fontsize=11, fontweight="bold")
-    plt.tight_layout(); plt.show()
-"""),
-        md(f"""\
-## 8. What this phase proves
-
-| Step | Run | Best FID | Delta vs previous |
-|---|---|---:|---:|
-| 3.1 MSE + KL | `p3_1_vae` | {best_fid(runs['p3_1_vae'])[1]:.1f} | n/a |
-| 3.2 + perceptual | `p3_2_vae_perceptual` | {best_fid(runs['p3_2_vae_perceptual'])[1]:.1f} | {best_fid(runs['p3_2_vae_perceptual'])[1] - best_fid(runs['p3_1_vae'])[1]:+.1f} |
-| 3.3 + PatchGAN | `p3_3_vae_patchgan` | {best_fid(runs['p3_3_vae_patchgan'])[1]:.1f} | {best_fid(runs['p3_3_vae_patchgan'])[1] - best_fid(runs['p3_2_vae_perceptual'])[1]:+.1f} |
-
-The losses add different kinds of pressure. MSE + KL keeps the latent model
-stable but blurry. Perceptual loss restores more visual texture. PatchGAN adds
-an adversarial signal that makes the prior samples look more face-like.
-
-**Decision:** select the composite VAE recipe for the final comparison:
-MSE + 0.25 KL + 0.1 VGG perceptual + 0.01 PatchGAN adversarial.
-
-**Report conclusion:** Phase 3 shows that the VAE can be improved through
-complementary losses, but even the selected recipe remains a speed-oriented
-family rather than the strongest quality candidate.
-"""),
-    ]
-    write_nb(NOTEBOOK_SEQUENCE["phase3"], cells)  # file name comes from NOTEBOOK_SEQUENCE: 04_vae_loss_progression.ipynb
-
-
-# PHASE 4 - DDPM schedule, target, and width evolution (writes notebook 05_ddpm_recipe_progression from the four saved p4_* run logs)
-def build_phase4():  # takes no arguments, returns None; its only output is the notebook file written at the end
-    runs = {n: load(n) for n in ["p4_1_ddpm_linear", "p4_2_ddpm_cosine",
-                                  "p4_3_ddpm_vpred", "p4_4_ddpm_wider"]}  # `load` is defined outside this view; presumably reads the run's JSON log - TODO confirm
-    best_name = min(runs, key=lambda n: best_fid(runs[n])[1] or 9e9)  # lowest best-FID wins; a falsy FID (None or 0) is ranked last via the 9e9 sentinel
-    _, best_val = best_fid(runs[best_name])  # NOTE(review): rendered with :.1f in the first cell, so a None here would raise at build time
-
-    cells = [  # md(f"...") cells are rendered now, at build time; code("...") strings are stored verbatim as notebook code cells
-        md(f"""\
-# 05 - DDPM Recipe Progression
-
-Phase 4 applies the same report logic to diffusion models. The pipeline is
-already fixed, so this notebook isolates the DDPM recipe: schedule, prediction
-target, and backbone width.
-
-The story is stepwise. A cosine schedule helps, v-prediction is the major gain,
-and the wider backbone becomes useful only after the target and schedule are
-improved.
-
-## What this phase changes
-
-| Run | Recipe change |
-|---|---|
-| `p4_1_ddpm_linear` | Linear noise schedule, epsilon-prediction |
-| `p4_2_ddpm_cosine` | Cosine noise schedule |
-| `p4_3_ddpm_vpred` | v-prediction target |
-| `p4_4_ddpm_wider` | Wider U-Net: base channels 192 with attention at 32/16/8 |
-
-Sampling previews use DDIM-50. Logged FID uses DDIM-100 against the saved real
-reference set.
-
-**Headline result:** `{best_name}` reaches **best FID = {best_val:.1f}**.
-
-## How to read DDPM sample grids
-
-The DDPM grids should not be read as the same faces improving from epoch to
-epoch. GAN and VAE previews can reuse a fixed latent grid, so each position can
-look like the same latent code becoming sharper over training. A DDPM preview
-starts from noise and runs a stochastic reverse-diffusion sampler. Unless the
-exact initial noise and sampler randomness are fixed and stored, each epoch
-preview is a fresh draw from the model.
-
-So for DDPM, the progression panels show distribution-level improvement:
-cleaner faces, fewer artifacts, and better global structure. They are not
-identity-by-identity refinements of the same preview images.
-"""),
-        md("""\
-### Reference: Phase 0 baseline from the same family
-
-`p0_ddpm` was a vanilla DDPM (linear schedule, epsilon-prediction, base_ch=128) on raw
-un-aligned data. Outputs were noisy face-shaped textures. Phase 4 fixes the
-pipeline (aligned 64) and walks through the standard set of post-2020 DDPM
-improvements one at a time.
-"""),
-        code(SHARED_IMPORTS),  # shared setup cell; later cells use load_log, fid_series, get_fid, SAMPLES, plt, pd, np, mpimg - presumably defined by it, verify
-        md("## 1. Load experiment logs\n\nThe notebook reads existing DDPM logs only. Sampling and FID values are already saved."),
-        code("""\
-run_names = ["p4_1_ddpm_linear", "p4_2_ddpm_cosine", "p4_3_ddpm_vpred", "p4_4_ddpm_wider"]
-run_labels = {
-    "p4_1_ddpm_linear": "4.1 linear / epsilon",
-    "p4_2_ddpm_cosine": "4.2 cosine / epsilon",
-    "p4_3_ddpm_vpred":  "4.3 cosine / v",
-    "p4_4_ddpm_wider":  "4.4 wider net",
-}
-runs = {n: load_log(n) for n in run_names}
-runs = {k: v for k, v in runs.items() if v}
-for n in run_names: print(f"  {n}: {'OK' if n in runs else 'MISSING'}")
-"""),
-        md("## 2. FID comparison table\n\nThe table shows whether each recipe change improves generation quality under the saved DDIM-100 FID protocol."),
-        code("""\
-rows = []
-for name in run_names:
-    if name not in runs: continue
-    r = runs[name]; _, fid_vals = fid_series(r)
-    rows.append({
-        "Run": run_labels[name],
-        "FID@25":  get_fid(r, 25),
-        "FID@50":  get_fid(r, 50),
-        "FID@100": get_fid(r, 100),
-        "Best FID": min(fid_vals) if fid_vals else None,
-        "Loss@100": r["history"]["loss"][-1],
-        "Train (min)": (r['history'].get('train_time_s') or 0) / 60,
-    })
-df = pd.DataFrame(rows).sort_values("Best FID")
-df.style.format({"FID@25": "{:.1f}", "FID@50": "{:.1f}", "FID@100": "{:.1f}",
-                 "Best FID": "{:.1f}", "Loss@100": "{:.4f}", "Train (min)": "{:.1f}"})
-"""),
-        md("## 3. FID curves - progression\n\nThe curves make the DDPM recipe evolution visible. The main evidence is not only that the wider final model wins, but that the big jump happens when the prediction target changes to v-prediction."),
-        code("""\
-fig, ax = plt.subplots(figsize=(10, 5))
-cmap = plt.cm.cividis
-for i, name in enumerate(run_names):
-    if name not in runs: continue
-    epochs, fid_vals = fid_series(runs[name])
-    ax.plot(epochs, fid_vals, "o-", label=run_labels[name],
-            color=cmap(i / len(run_names)), linewidth=2, markersize=7)
-ax.set_xlabel("Epoch"); ax.set_ylabel("FID (DDIM-100)")
-ax.set_title("Phase 4 - FID curves"); ax.legend()
-plt.tight_layout(); plt.show()
-"""),
-        md("## 4. Training loss\n\nThe loss plot is diagnostic, but epsilon-MSE and v-MSE are different targets. FID and sample grids carry the decision."),
-        code("""\
-fig, ax = plt.subplots(figsize=(10, 4))
-cmap = plt.cm.cividis
-for i, name in enumerate(run_names):
-    if name not in runs: continue
-    h = runs[name]["history"]
-    epochs = range(1, len(h["loss"]) + 1)
-    ax.plot(epochs, h["loss"], color=cmap(i / len(run_names)), label=run_labels[name], linewidth=1.3)
-ax.set_xlabel("Epoch"); ax.set_ylabel("MSE on prediction target")
-ax.set_title("Loss (epsilon-MSE and v-MSE are not directly comparable)")
-ax.legend(); plt.tight_layout(); plt.show()
-"""),
-        md("## 5. Sample grids - epoch 100\n\nThese grids show the qualitative side of the FID drop. They should be read as independent samples from each checkpoint, with attention to global face structure, texture noise, and artifact frequency."),
-        code("""\
-fig, axes = plt.subplots(1, 4, figsize=(16, 4.5))
-for ax, name in zip(axes, run_names):
-    img_path = SAMPLES / name / "epoch_0100.png"
-    if img_path.exists():
-        ax.imshow(mpimg.imread(str(img_path)))
-        f = get_fid(runs.get(name, {}), 100) if name in runs else None
-        ax.set_title(f"{run_labels[name]}\\nFID@100={f:.1f}" if f else run_labels[name], fontsize=9)
-    ax.axis("off")
-plt.tight_layout(); plt.show()
-"""),
-        md("## 6. Progression - epoch 10 -> 50 -> 100\n\nRead these as fresh samples from each checkpoint, not the same DDPM images being refined over time."),
-        code("""\
-check_epochs = [10, 50, 100]
-for name in run_names:
-    if name not in runs: continue
-    fig, axes = plt.subplots(1, len(check_epochs), figsize=(12, 4))
-    for ax, e in zip(axes, check_epochs):
-        p = SAMPLES / name / f"epoch_{e:04d}.png"
-        if p.exists():
-            ax.imshow(mpimg.imread(str(p)))
-            f = get_fid(runs[name], e)
-            ax.set_title(f"epoch {e}" + (f"\\nFID={f:.1f}" if f else ""), fontsize=9)
-        ax.axis("off")
-    fig.suptitle(run_labels[name], fontsize=11, fontweight="bold")
-    plt.tight_layout(); plt.show()
-"""),
-        md("## 7. Noise schedule visualization\n\nThe cosine schedule preserves useful signal more smoothly across timesteps. That gives the model a better learning problem before v-prediction and width are added."),
-        code("""\
-import math
-T = 1000; t = np.arange(T)
-betas_lin = np.linspace(1e-4, 0.02, T)
-ab_lin    = np.cumprod(1 - betas_lin)
-s = 0.008
-f = np.cos((np.arange(T + 1) / T + s) / (1 + s) * math.pi / 2) ** 2  # f(0..T): T+1 points give T betas
-f = f / f[0]
-betas_cos = np.clip(1 - f[1:] / f[:-1], 0, 0.999)
-ab_cos    = np.cumprod(1 - betas_cos)
-
-fig, axes = plt.subplots(1, 2, figsize=(13, 4))
-axes[0].plot(t, ab_lin, label="linear", color="#5B8DB8", linewidth=2)
-axes[0].plot(t[:len(ab_cos)], ab_cos, label="cosine", color="#E8705A", linewidth=2)
-axes[0].set_xlabel("Timestep t"); axes[0].set_ylabel("alpha_bar_t (signal fraction)")
-axes[0].set_title("Cumulative signal preservation"); axes[0].legend()
-axes[1].plot(betas_lin, label="linear beta", color="#5B8DB8", linewidth=2)
-axes[1].plot(betas_cos, label="cosine beta", color="#E8705A", linewidth=2)
-axes[1].set_xlabel("Timestep t"); axes[1].set_ylabel("beta_t"); axes[1].set_title("beta schedule"); axes[1].legend()
-plt.tight_layout(); plt.show()
-"""),
-        md(f"""\
-## 8. What this phase proves
-
-| Step | Run | Best FID | Delta vs previous |
-|---|---|---:|---:|
-| 4.1 linear / epsilon | `p4_1_ddpm_linear` | {best_fid(runs['p4_1_ddpm_linear'])[1]:.1f} | n/a |
-| 4.2 cosine / epsilon | `p4_2_ddpm_cosine` | {best_fid(runs['p4_2_ddpm_cosine'])[1]:.1f} | {best_fid(runs['p4_2_ddpm_cosine'])[1] - best_fid(runs['p4_1_ddpm_linear'])[1]:+.1f} |
-| 4.3 cosine / v | `p4_3_ddpm_vpred` | {best_fid(runs['p4_3_ddpm_vpred'])[1]:.1f} | {best_fid(runs['p4_3_ddpm_vpred'])[1] - best_fid(runs['p4_2_ddpm_cosine'])[1]:+.1f} |
-| 4.4 wider net | `p4_4_ddpm_wider` | {best_fid(runs['p4_4_ddpm_wider'])[1]:.1f} | {best_fid(runs['p4_4_ddpm_wider'])[1] - best_fid(runs['p4_3_ddpm_vpred'])[1]:+.1f} |
-
-The largest improvement is v-prediction. The wider network then helps because
-the schedule and prediction target have already made the learning problem
-better aligned with sample quality.
-
-**Decision:** select the DDPM recipe with cosine schedule, v-prediction,
-base_ch=192, and attention at 32/16/8 for the final comparison.
-
-**Report conclusion:** Phase 4 turns DDPM from the textured but noisy baseline
-into the strongest quality candidate for Phase 5.
-"""),
-    ]
-    write_nb(NOTEBOOK_SEQUENCE["phase4"], cells)  # file name comes from NOTEBOOK_SEQUENCE: 05_ddpm_recipe_progression.ipynb
-
-
-# PHASE 5 - Cross-family final comparison
-def build_phase5():
-    p5 = {n: load(n) for n in ["p5_gan", "p5_vae", "p5_ddpm"]}
-    rows = []
-    for n, log in p5.items():
-        e, v = best_fid(log)
-        rows.append((n, v, e, time_min(log)))
-    rows.sort(key=lambda r: r[1] or 9e9)
-    headline = ", ".join(f"{n}={v:.1f}" for n, v, _, _ in rows)
-
-    cells = [
-        md(f"""\
-# 06 - Final Family Comparison
-
-Phase 5 is the project-level comparison. It loads the already trained best
-recipes from the GAN, VAE, and DDPM branches and compares their saved logs,
-sample grids, and checkpoint-based interpolation diagnostics.
-
-The earlier notebooks explain how each recipe was selected. This notebook asks
-the practical question: which family gives the best quality, which is fastest,
-and which one should the project recommend overall?
-
-## What this phase changes
-
-Nothing new is trained here. The comparison uses the saved Phase 5 artifacts:
-
-| Family | Selected recipe |
-|---|---|
-| GAN | WGAN-GP with spectral norm, GroupNorm, and self-attention |
-| VAE | MSE + KL + perceptual + PatchGAN losses |
-| DDPM | Cosine schedule + v-prediction + wider U-Net |
-
-**Headline FIDs from saved logs:** {headline}.
-"""),
-        code(SHARED_IMPORTS + """
-
-FAMILIES = {
-    "GAN":  {"p5": "p5_gan",  "color": "#5B8DB8", "label": "WGAN-GP + SN + Attn"},
-    "VAE":  {"p5": "p5_vae",  "color": "#E8B85A", "label": "VAE + Perceptual + PatchGAN"},
-    "DDPM": {"p5": "p5_ddpm", "color": "#E8705A", "label": "DDPM cosine v-pred wider"},
-}
-
-logs_p5 = {fam: load_log(info["p5"]) for fam, info in FAMILIES.items()}
-"""),
-        md("## 1. Quantitative summary\n\nThe table compares the saved best-of-family runs under the same Phase 5 setup. Lower FID is better; training time gives the speed side of the tradeoff."),
-        code("""\
-rows = []
-for fam, info in FAMILIES.items():
-    log = logs_p5[fam]
-    if log is None: continue
-    h   = log["history"]; cfg = log["config"]
-    _, fid_vals = fid_series(log)
-    rows.append({
-        "Family":       fam,
-        "Architecture": info["label"],
-        "Resolution":   f"{cfg.get('image_size')}x{cfg.get('image_size')}",
-        "Epochs":       len(h.get("loss") or h.get("g_loss") or h.get("recon_loss") or []),
-        "Best FID":     min(fid_vals) if fid_vals else None,
-        "Last FID":     fid_vals[-1] if fid_vals else None,
-        "Train (min)":  (h.get("train_time_s") or 0) / 60,
-    })
-df = pd.DataFrame(rows).sort_values("Best FID")
-df.style.format({"Best FID": "{:.1f}", "Last FID": "{:.1f}", "Train (min)": "{:.1f}"})
-"""),
-        md("## 2. Quality vs training time\n\nThis post-hoc plot uses only the existing log summaries. It makes the practical decision visible: DDPM is best by FID, GAN is the stronger speed-quality compromise, and VAE is fastest but behind in sample quality."),
-        code("""\
-fig, ax = plt.subplots(figsize=(7, 4.8))
-plot_df = df.copy()
-for _, row in plot_df.iterrows():
-    fam = row["Family"]
-    info = FAMILIES[fam]
-    ax.scatter(row["Train (min)"], row["Best FID"], s=120, color=info["color"], label=fam)
-    ax.text(row["Train (min)"] + 1.0, row["Best FID"], fam, va="center", fontsize=10)
-ax.set_xlabel("Training time (min)")
-ax.set_ylabel("Best FID (lower is better)")
-ax.set_title("Final comparison: quality vs training time")
-ax.grid(alpha=0.25)
-plt.tight_layout(); plt.show()
-"""),
-        md("## 3. FID curves - all three families\n\nThis plot puts the selected family recipes on one timeline. It is needed because the best final FID alone does not show convergence behavior: DDPM reaches the best quality, GAN remains close with less time, and VAE is fast but saturates at a higher FID."),
-        code("""\
-fig, ax = plt.subplots(figsize=(10, 5))
-for fam, info in FAMILIES.items():
-    log = logs_p5[fam]
-    if log is None: continue
-    epochs, vals = fid_series(log)
-    ax.plot(epochs, vals, "o-", color=info["color"], label=f"{fam} ({info['label']})",
-            linewidth=2, markersize=7)
-ax.set_xlabel("Epoch"); ax.set_ylabel("FID")
-ax.set_title("Phase 5 - FID across families")
-ax.legend(); plt.tight_layout(); plt.show()
-"""),
-        md("## 4. Best-epoch sample grids\n\nThe grids support the numeric ranking with visible sample quality. DDPM and GAN produce sharper, more plausible faces than the VAE prior samples."),
-        code("""\
-fig, axes = plt.subplots(1, 3, figsize=(16, 6))
-for ax, (fam, info) in zip(axes, FAMILIES.items()):
-    log = logs_p5[fam]
-    samples_dir = SAMPLES / info["p5"]
-    pngs = sorted(samples_dir.glob("epoch_*.png"))
-    pngs = [p for p in pngs if "_recon" not in p.stem]
-    if not pngs:
-        ax.set_title(f"{fam} (no samples)"); ax.axis("off"); continue
-    img_path = pngs[-1]  # last preview = closest to final_ema
-    ax.imshow(mpimg.imread(str(img_path)))
-    e, v = (None, None)
-    if log:
-        _, fid_vals = fid_series(log)
-        v = min(fid_vals) if fid_vals else None
-    ax.set_title(f"{fam} - {info['label']}\\n{img_path.stem}" + (f"  best FID={v:.1f}" if v else ""), fontsize=10)
-    ax.axis("off")
-fig.suptitle("Final samples from each family", fontsize=13, fontweight="bold")
-plt.tight_layout(); plt.show()
-"""),
-        md("## 5. Training progression - early -> late\n\nFor GAN and VAE runs, preview grids may reuse a fixed latent layout, so positions can feel like they improve over time. DDPM previews are different: each grid is a fresh stochastic sample from that checkpoint unless the exact starting noise was fixed and stored. Read the DDPM row as distribution quality improving, not the same faces being polished."),
-        code("""\
-for fam, info in FAMILIES.items():
-    log = logs_p5[fam]
-    samples_dir = SAMPLES / info["p5"]
-    pngs = sorted(p for p in samples_dir.glob("epoch_*.png") if "_recon" not in p.stem)
-    if not pngs: continue
-    # Pick ~4 evenly spaced previews
-    n_pick = min(4, len(pngs))
-    picks  = [pngs[i * (len(pngs) - 1) // (n_pick - 1)] for i in range(n_pick)] if n_pick > 1 else pngs
-    fig, axes = plt.subplots(1, len(picks), figsize=(4 * len(picks), 4))
-    if len(picks) == 1: axes = [axes]
-    for ax, p in zip(axes, picks):
-        ax.imshow(mpimg.imread(str(p)))
-        ep = int(p.stem.split("_")[1])
-        f = get_fid(log, ep) if log else None
-        ax.set_title(f"epoch {ep}" + (f"\\nFID={f:.1f}" if f else ""), fontsize=9)
-        ax.axis("off")
-    fig.suptitle(f"{fam} - {info['label']}", fontsize=11, fontweight="bold")
-    plt.tight_layout(); plt.show()
-"""),
-        md("## 6. Per-family training loss\n\nThe losses are not directly comparable across families, but they confirm that each saved recipe ran through its expected optimization path."),
-        code("""\
-fig, axes = plt.subplots(1, 3, figsize=(18, 4))
-for ax, (fam, info) in zip(axes, FAMILIES.items()):
-    log = logs_p5[fam]
-    if not log: ax.set_title(f"{fam} (missing)"); ax.axis("off"); continue
-    h = log["history"]; c = info["color"]
-    if fam == "GAN":
-        ax.plot(h["g_loss"], label="G loss", color=c, linewidth=1.2)
-        if "w_dist" in h:
-            ax.plot(h["w_dist"], label="W-dist", color=c, linewidth=1.2, linestyle="--")
-        ax.set_ylabel("Loss / W-distance")
-    elif fam == "VAE":
-        ax.plot(h["recon_loss"], label="recon", color=c, linewidth=1.2)
-        ax2 = ax.twinx()
-        ax2.plot(h["kl_loss"], label="KL", color=c, alpha=0.5, linestyle="--")
-        ax2.set_ylabel("KL")
-        ax.set_ylabel("Recon")
-    else:  # DDPM
-        ax.plot(h["loss"], color=c, linewidth=1.2, label="loss")
-        ax.set_ylabel("MSE on v-prediction")
-    ax.set_xlabel("Epoch"); ax.set_title(f"{fam}"); ax.legend(loc="upper right", fontsize=8)
-plt.tight_layout(); plt.show()
-"""),
-        md("""\
-## 7. Latent interpolation - GAN and VAE
-
-Smooth interpolation between two latent codes reveals whether the generator has
-learned a continuous manifold rather than a sparse memorisation. DDPM has no
-encoder, so this section is GAN/VAE only.
-
-The interpolation figures are not a ranking metric. They are included to make
-the learned representation easier to inspect: smooth transitions support the
-claim that the models learned a face manifold, while sudden jumps would suggest
-fragile or memorised structure.
-
-**Checkpoint loading note:** the cell below uses the same priority as
-`tools/sampling.py`: `final_ema` first, then `best_ema` as fallback. This avoids
-using a best-FID EMA snapshot that may have been saved very early for a
-slowly-converging run.
-"""),
-        code("""\
-import sys
-sys.path.insert(0, str(GENERATOR_ROOT))
-import torch
-from src.models import get_model
-from src.utils import load_config
-
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
-def load_ema_model(run_name, config_name):
-    cfg = load_config(str(GENERATOR_ROOT / "configs" / "phase5" / config_name))
-    model_obj, _ = get_model(cfg)
-    model = model_obj[0] if isinstance(model_obj, tuple) else model_obj
-    # final_ema first; see the checkpoint loading note above.
-    for fname in [f"{run_name}_final_ema.pt", f"{run_name}_best_ema.pt"]:
-        p = GENERATOR_ROOT / "outputs" / "models" / fname
-        if not p.exists(): continue
-        sd = torch.load(p, map_location=DEVICE, weights_only=True)
-        missing, unexpected = model.load_state_dict(sd, strict=False)
-        if not missing and not unexpected:
-            print(f"  loaded {fname}")
-            return model.to(DEVICE).eval(), cfg
-    raise FileNotFoundError(f"No usable EMA checkpoint for {run_name}")
-
-
-def slerp(z1, z2, t):
-    z1n = z1 / z1.norm(); z2n = z2 / z2.norm()
-    omega = torch.acos((z1n * z2n).sum().clamp(-1, 1))
-    if omega.abs() < 1e-6: return (1 - t) * z1 + t * z2
-    return (torch.sin((1 - t) * omega) / torch.sin(omega)) * z1 + \\
-           (torch.sin(t * omega) / torch.sin(omega)) * z2
-"""),
-        code("""\
-# GAN slerp interpolation
-try:
-    gan_model, gan_cfg = load_ema_model("p5_gan", "p5_gan.json")
-    latent_dim = gan_cfg.get("latent_dim", 128)
-    z1 = torch.randn(1, latent_dim, 1, 1, device=DEVICE)
-    z2 = torch.randn(1, latent_dim, 1, 1, device=DEVICE)
-    with torch.no_grad():
-        ts = torch.linspace(0, 1, 10)
-        zs = torch.cat([slerp(z1, z2, t) for t in ts])
-        imgs = gan_model(zs).clamp(-1, 1)
-    imgs = (imgs + 1) / 2
-    fig, axes = plt.subplots(1, 10, figsize=(20, 2.5))
-    for ax, img, t in zip(axes, imgs.cpu(), ts):
-        ax.imshow(img.permute(1, 2, 0).numpy())
-        ax.set_title(f"t={float(t):.2f}", fontsize=8)
-        ax.axis("off")
-    fig.suptitle("GAN - slerp latent interpolation z1 -> z2", fontsize=11, fontweight="bold")
-    plt.tight_layout(); plt.show()
-except Exception as e:
-    print(f"GAN interpolation skipped: {e}")
-"""),
-        code("""\
-# VAE encode-interpolate-decode
-# Encodes the first two dataset items, linearly interpolates between their
-# latent means in 10 steps, decodes each step and plots the frames in one row.
-# Any failure is printed and the cell is skipped instead of stopping the run.
-try:
-    from src.data import GeneratorDataset, get_transform
-    vae_model, vae_cfg = load_ema_model("p5_vae", "p5_vae.json")
-    # Dataset is rebuilt from the run config; sources defaults to ["wiki"].
-    ds = GeneratorDataset(str(PROJECT_ROOT / vae_cfg["data_dir"]),
-                          sources=vae_cfg.get("sources", ["wiki"]),
-                          transform=get_transform(vae_cfg["image_size"], augment=False))
-    # Assumes ds[i] returns a single image tensor - TODO confirm against src.data.
-    real = torch.stack([ds[i] for i in range(2)]).to(DEVICE)
-    with torch.no_grad():
-        # Only the means are interpolated; logvar is not used below.
-        mu, logvar = vae_model.encode(real)
-        z1, z2 = mu[0:1], mu[1:2]
-        ts = torch.linspace(0, 1, 10)
-        zs = torch.cat([(1 - t) * z1 + t * z2 for t in ts])
-        imgs = vae_model.decode(zs).clamp(-1, 1)
-    # Rescale the clamped [-1, 1] output to [0, 1] for imshow.
-    imgs = (imgs + 1) / 2
-    fig, axes = plt.subplots(1, 10, figsize=(20, 2.5))
-    for ax, img, t in zip(axes, imgs.cpu(), ts):
-        # Move the leading axis last (assumes channel-first images - TODO confirm).
-        ax.imshow(img.permute(1, 2, 0).numpy())
-        ax.set_title(f"t={float(t):.2f}", fontsize=8)
-        ax.axis("off")
-    fig.suptitle("VAE - linear latent interpolation (encoded real images)", fontsize=11, fontweight="bold")
-    plt.tight_layout(); plt.show()
-except Exception as e:
-    print(f"VAE interpolation skipped: {e}")
-"""),
-        md(f"""\
-## 8. Final decision
-
-| Family | Architecture | Best FID | Train time |
-|---|---|---:|---:|
-""" + "\n".join(
-    f"| {n.split('_')[1].upper()} | "
-    f"{ {'p5_gan':'WGAN-GP + SN + Attn', 'p5_vae':'VAE + Perceptual + PatchGAN', 'p5_ddpm':'DDPM cosine v-pred wider'}[n] } | "
-    f"{v:.1f} | {t:.1f} min |"
-    for n, v, _, t in rows
-) + f"""
-
-The saved logs rank DDPM best by FID, GAN close enough to be the practical
-speed-quality alternative, and VAE clearly behind for prior-sample quality. The
-VAE remains useful when fast iteration or reconstruction behavior matters, but
-it is not the strongest final generator.
-
-**Practical sampling notes** (encoded in `tools/sampling.py`):
-- Prefer `*_final_ema.pt` before `*_best_ema.pt`.
-- DDPM sampling at DDIM-50 matches the training preview; DDIM-100 is for FID only.
-- GAN truncation is **off by default** (matches training); enable for sharper but less
-  diverse samples.
-
-**Final recommendation:** choose DDPM when maximum visual quality is the priority.
-Choose GAN when speed and quality both matter. Use VAE for fast prototyping or
-reconstruction-focused analysis.
-
-**Report conclusion:** Across the full pipeline, the project moves from raw
-baseline failure to a locked aligned pipeline, then to family-specific recipes.
-The final comparison supports DDPM as the best-quality generator and GAN as the
-best practical compromise.
-"""),
-    ]
-    write_nb(NOTEBOOK_SEQUENCE["phase5"], cells)
-
-
-# PHASE 6 - Final selected sample showcase
-def build_phase6():
-    cells = [
-        md("""\
-# 07 - Final Sample Showcase
-
-This final notebook is a small showcase chapter. Phase 5 compared the model
-families quantitatively; this notebook selects the three strongest individual
-images from a large generated pool for each final Phase 5 recipe.
-
-The candidate pool is the saved final-comparison output:
-20 grids per architecture x 16 images per grid = 320 candidates for each model
-family, or 960 individual generated images total.
-
-## What this phase changes
-
-No model is trained or fine-tuned. No FID is recomputed. The notebook only
-splits already generated Phase 5 grids into individual images, scores them with
-a deterministic visual-quality heuristic, and saves the top three examples per
-architecture.
-
-The score is useful for reproducible curation, but it is not a scientific
-quality metric. The Phase 5 FID ranking remains the main quantitative result.
-"""),
-        code(SHARED_IMPORTS + """\
-from PIL import Image, ImageDraw, ImageFont
-from skimage.color import rgb2gray, rgb2hsv
-from skimage.filters import laplace
-
-# Output folder for this notebook and input folder holding the saved grids.
-SHOWCASE_ROOT = OUTPUTS / "samples" / "final_showcase"
-FINAL_COMPARISON_ROOT = OUTPUTS / "samples" / "final_comparison"
-# Create the output folders up front; existing folders are left untouched.
-SHOWCASE_ROOT.mkdir(parents=True, exist_ok=True)
-(SHOWCASE_ROOT / "top_tiles").mkdir(parents=True, exist_ok=True)
-
-# Run name (also the sub-folder name under FINAL_COMPARISON_ROOT) -> display label.
-# Iteration order of this dict fixes the row order of the tables and the panel.
-RUNS = {
-    "p5_gan": "GAN - WGAN-GP + SN + Attn",
-    "p5_vae": "VAE - perceptual + PatchGAN",
-    "p5_ddpm": "DDPM - cosine v-pred wider",
-}
-"""),
-        md("""\
-## 1. Candidate pool
-
-Each `grid_*.png` is a 4x4 generated sample matrix. The cell below confirms
-how many candidates are available per architecture.
-"""),
-        code("""\
-# Count the saved grid_*.png files per run and show one summary row per run.
-rows = []
-for run, label in RUNS.items():
-    grids = sorted((FINAL_COMPARISON_ROOT / run).glob("grid_*.png"))
-    rows.append({
-        "run": run,
-        "architecture": label,
-        "grids": len(grids),
-        # 16 = tiles in one 4x4 grid; the files themselves are not opened here.
-        "candidate_images": len(grids) * 16,
-        # Folder is reported relative to GENERATOR_ROOT for readability.
-        "folder": str((FINAL_COMPARISON_ROOT / run).relative_to(GENERATOR_ROOT)),
-    })
-candidate_pool_df = pd.DataFrame(rows)
-display(candidate_pool_df)
-"""),
-        md("""\
-## 2. Selection method
-
-The selector scores every tile using four simple image properties:
-
-- exposure: avoids images that are too dark or too bright
-- contrast: avoids flat gray outputs
-- detail: favors sharper structure
-- color: avoids very washed-out samples
-
-This is a report curation tool, not a replacement for FID or visual judgment.
-It simply makes the final showcase deterministic and auditable.
-"""),
-        code("""\
-def split_grid(path, nrow=4, padding=2):
-    # Split a saved sample grid image into its individual square tiles.
-    #
-    # path:    image file containing the grid.
-    # nrow:    number of tiles in each grid row.
-    # padding: border in pixels around and between tiles (the geometry assumes
-    #          one border on every side of every tile, as torchvision-style
-    #          grid writers produce - TODO confirm against the grid writer).
-    # Returns a list of (row, col, tile) tuples in row-major order, where tile
-    # is an RGB array slice of tile_w x tile_w pixels.
-    #
-    # The context manager releases the file handle as soon as pixels are copied.
-    with Image.open(path) as img:
-        arr = np.asarray(img.convert("RGB"))
-    h, w = arr.shape[:2]
-    tile_w = (w - (nrow + 1) * padding) // nrow
-    if tile_w <= 0:
-        # Image too small to hold nrow tiles; avoids a zero or negative stride.
-        return []
-    stride = tile_w + padding
-    nrows = (h - padding) // stride
-    tiles = []
-    for r in range(nrows):
-        y0 = padding + r * stride
-        for c in range(nrow):
-            x0 = padding + c * stride
-            tile = arr[y0:y0 + tile_w, x0:x0 + tile_w]
-            # Keep only full tiles; slices clipped by the image edge are dropped.
-            if tile.shape[0] == tile_w and tile.shape[1] == tile_w:
-                tiles.append((r, c, tile))
-    return tiles
-
-
-def score_tile(tile):
-    # Heuristic visual-quality score for one RGB tile (curation aid, not a metric).
-    #
-    # Four sub-scores are blended with weights 0.30 / 0.30 / 0.25 / 0.15:
-    # exposure (mean gray level close to 0.48), contrast (gray std), detail
-    # (variance of the Laplacian) and color (mean HSV saturation). Nearly flat
-    # tiles and very dark or very bright tiles are then down-weighted.
-    # Returns a dict with the final score, the raw statistics and the sub-scores.
-    rgb = tile.astype("float32") / 255.0
-    luma = rgb2gray(rgb)
-
-    brightness = float(luma.mean())
-    spread = float(luma.std())
-    saturation = float(rgb2hsv(rgb)[..., 1].mean())
-    sharpness = float(np.var(laplace(luma)))
-
-    subscores = {
-        "exposure_score": max(0.0, 1.0 - abs(brightness - 0.48) / 0.32),
-        "contrast_score": min(spread / 0.24, 1.0),
-        "detail_score": min(np.log1p(sharpness * 6000.0) / 4.0, 1.0),
-        "color_score": min(saturation / 0.38, 1.0),
-    }
-    total = (
-        0.30 * subscores["exposure_score"]
-        + 0.30 * subscores["contrast_score"]
-        + 0.25 * subscores["detail_score"]
-        + 0.15 * subscores["color_score"]
-    )
-
-    # Penalties are applied one after the other, so they compound.
-    nearly_flat = spread < 0.035
-    badly_exposed = brightness < 0.12 or brightness > 0.88
-    if nearly_flat:
-        total *= 0.15
-    if badly_exposed:
-        total *= 0.4
-
-    return {
-        "score": float(total),
-        "mean": brightness,
-        "std": spread,
-        "saturation": saturation,
-        "sharpness": sharpness,
-        **subscores,
-    }
-
-
-# Score every tile of every saved grid and collect one record per tile.
-records = []
-for run, label in RUNS.items():
-    grids = sorted((FINAL_COMPARISON_ROOT / run).glob("grid_*.png"))
-    for grid_path in grids:
-        # Numeric suffix of the file stem, e.g. the part after the last "_".
-        grid_index = int(grid_path.stem.split("_")[-1])
-        # tile_index is 1-based and follows the row-major order of split_grid.
-        for tile_index, (row, col, tile) in enumerate(split_grid(grid_path), start=1):
-            records.append({
-                "run": run,
-                "architecture": label,
-                "grid": grid_path.name,
-                "grid_index": grid_index,
-                "tile_index": tile_index,
-                "row": row,
-                "col": col,
-                "source_path": str(grid_path.relative_to(GENERATOR_ROOT)),
-                **score_tile(tile),
-            })
-
-# Persist the full score table, then show count / mean / max score per run.
-candidate_scores_df = pd.DataFrame(records)
-candidate_scores_df.to_csv(SHOWCASE_ROOT / "candidate_scores.csv", index=False)
-display(candidate_scores_df.groupby("run")["score"].agg(["count", "mean", "max"]).reset_index())
-"""),
-        md("""\
-## 3. Top three per architecture
-
-The cell below saves the selected individual images and a combined showcase
-panel under `generator/outputs/samples/final_showcase`.
-"""),
-        code("""\
-# Part 1: pick the three highest-scoring tiles per run and save each as a PNG.
-selected_parts = []
-for run, group in candidate_scores_df.groupby("run", sort=False):
-    selected = group.sort_values("score", ascending=False).head(3).copy()
-    selected["rank"] = range(1, len(selected) + 1)
-    out_dir = SHOWCASE_ROOT / "top_tiles" / run
-    out_dir.mkdir(parents=True, exist_ok=True)
-    tile_paths = []
-    for _, row in selected.iterrows():
-        grid_path = GENERATOR_ROOT / row["source_path"]
-        # Re-split the source grid to recover the pixels; tile_index is 1-based.
-        tile = split_grid(grid_path)[int(row["tile_index"]) - 1][2]
-        tile_path = out_dir / f"rank_{int(row['rank']):02d}_{grid_path.stem}_tile_{int(row['tile_index']):02d}.png"
-        Image.fromarray(tile).save(tile_path)
-        tile_paths.append(str(tile_path.relative_to(GENERATOR_ROOT)))
-    selected["tile_path"] = tile_paths
-    selected_parts.append(selected)
-
-# Part 2: persist the selection as CSV and JSON next to the saved tiles.
-selected_top3_df = pd.concat(selected_parts, ignore_index=True).sort_values(["run", "rank"])
-selected_top3_df.to_csv(SHOWCASE_ROOT / "selected_top3.csv", index=False)
-selected_top3_df.to_json(SHOWCASE_ROOT / "selected_top3.json", orient="records", indent=2)
-
-# Part 3: compose one panel image - one row per run, three tiles per row, with
-# a label strip of label_h pixels above each row of tiles.
-tile_size = 128
-label_h = 46
-cols = 3
-rows = len(RUNS)
-panel = Image.new("RGB", (cols * tile_size, rows * (tile_size + label_h)), "white")
-draw = ImageDraw.Draw(panel)
-# Use Arial when it can be loaded; otherwise fall back to Pillow's default font.
-try:
-    font = ImageFont.truetype("arial.ttf", 13)
-    font_small = ImageFont.truetype("arial.ttf", 11)
-except Exception:
-    font = ImageFont.load_default()
-    font_small = ImageFont.load_default()
-
-for r, (run, label) in enumerate(RUNS.items()):
-    group = selected_top3_df[selected_top3_df["run"] == run].sort_values("rank")
-    y_base = r * (tile_size + label_h)
-    # Architecture label goes at the top of the row's label strip.
-    draw.text((4, y_base + 2), label, fill="black", font=font)
-    for c, (_, row) in enumerate(group.iterrows()):
-        # Saved tiles are reloaded and resized to the panel's tile size.
-        tile = Image.open(GENERATOR_ROOT / row["tile_path"]).convert("RGB").resize((tile_size, tile_size), Image.Resampling.BICUBIC)
-        x = c * tile_size
-        y = y_base + label_h
-        panel.paste(tile, (x, y))
-        # Rank and score caption sits in the label strip, above its tile.
-        draw.text((x + 4, y_base + 22), f"rank {int(row['rank'])} score {row['score']:.3f}", fill="black", font=font_small)
-
-panel_path = SHOWCASE_ROOT / "phase5_top3_panel.png"
-panel.save(panel_path)
-display(selected_top3_df[["run", "rank", "score", "grid", "tile_index", "tile_path"]])
-print(f"Saved panel: {panel_path.relative_to(GENERATOR_ROOT)}")
-"""),
-        md("""\
-## 4. Final selected images
-
-These are the top three selected images for each Phase 5 architecture.
-"""),
-        code("""\
-# Reload the saved panel image from disk and show it without axes.
-# panel_path is recomputed here, so this cell does not rely on the variable
-# of the same name still being set by an earlier cell.
-panel_path = SHOWCASE_ROOT / "phase5_top3_panel.png"
-plt.figure(figsize=(8, 10))
-plt.imshow(mpimg.imread(str(panel_path)))
-plt.axis("off")
-plt.title("Phase 5 selected top-3 images per architecture")
-plt.show()
-"""),
-        md("""\
-## 5. Report conclusion
-
-The showcase supports the Phase 5 conclusion visually: DDPM gives the cleanest
-best-case samples, GAN is close and sharper than the VAE in many examples, and
-the VAE remains smoother and more conservative. These images are curated from a
-large generated pool, so they should be used as final qualitative examples, not
-as a replacement for the full distribution-level metrics.
-"""),
-    ]
-    write_nb(NOTEBOOK_SEQUENCE["phase6"], cells)
-
-
-if __name__ == "__main__":
-    # Script entry point: clear previously generated notebooks, then rebuild
-    # every phase notebook in reading order (phase 0 through phase 6).
-    print("Building notebooks...")
-    remove_old_generated_notebooks()
-    for build_phase in (
-        build_phase0,
-        build_phase1,
-        build_phase2,
-        build_phase3,
-        build_phase4,
-        build_phase5,
-        build_phase6,
-    ):
-        build_phase()
-    print("Done.")