Trying a few different VAE settings

This commit is contained in:
Johnny Fernandes
2026-05-02 00:32:45 +01:00
parent 1a7f67ab9c
commit f89d7dcfda
6 changed files with 71 additions and 43 deletions
@@ -7,6 +7,8 @@
"model": "vae", "model": "vae",
"latent_dim": 256, "latent_dim": 256,
"ngf": 64, "ngf": 64,
"free_bits": 0.1,
"grad_clip": 1.0,
"sample_interval": 10, "sample_interval": 10,
"fid_interval": 25, "fid_interval": 25,
"fid_n_real": 5000 "fid_n_real": 5000
+2 -2
View File
@@ -1,8 +1,8 @@
{ {
"extends": "_base_phase3.json", "extends": "_base_phase3.json",
"run_name": "p3_1_vae", "run_name": "p3_1_vae",
"lr": 1e-3, "lr": 5e-4,
"beta_kl": 1.0, "beta_kl": 0.005,
"lambda_perceptual": 0.0, "lambda_perceptual": 0.0,
"lambda_adversarial": 0.0 "lambda_adversarial": 0.0
} }
@@ -1,8 +1,8 @@
{ {
"extends": "_base_phase3.json", "extends": "_base_phase3.json",
"run_name": "p3_2_vae_perceptual", "run_name": "p3_2_vae_perceptual",
"lr": 1e-3, "lr": 5e-4,
"beta_kl": 0.0001, "beta_kl": 0.005,
"lambda_perceptual": 0.1, "lambda_perceptual": 0.1,
"lambda_adversarial": 0.0 "lambda_adversarial": 0.0
} }
@@ -1,9 +1,9 @@
{ {
"extends": "_base_phase3.json", "extends": "_base_phase3.json",
"run_name": "p3_3_vae_patchgan", "run_name": "p3_3_vae_patchgan",
"lr": 1e-3, "lr": 5e-4,
"lr_d": 1e-4, "lr_d": 1e-4,
"beta_kl": 0.0001, "beta_kl": 0.005,
"lambda_perceptual": 0.1, "lambda_perceptual": 0.1,
"lambda_adversarial": 0.1, "lambda_adversarial": 0.1,
"ndf_patch": 64 "ndf_patch": 64
+2 -1
View File
@@ -98,7 +98,8 @@ class VAE(nn.Module):
def encode(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: def encode(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
h = self.encoder(x).flatten(1) h = self.encoder(x).flatten(1)
return self.fc_mu(h), self.fc_lv(h) log_var = self.fc_lv(h).clamp(-10.0, 10.0)
return self.fc_mu(h), log_var
def reparameterize(self, mu: torch.Tensor, log_var: torch.Tensor) -> torch.Tensor: def reparameterize(self, mu: torch.Tensor, log_var: torch.Tensor) -> torch.Tensor:
std = torch.exp(0.5 * log_var) std = torch.exp(0.5 * log_var)
+52 -27
View File
@@ -403,13 +403,20 @@ def train_vae(
run_name: str, run_name: str,
device: str = "cuda", device: str = "cuda",
) -> dict: ) -> dict:
"""VAE training loop covering Phase 3.1–3.3. """VAE training loop covering Phase 3.1–3.3 and Phase 5.
Config toggles: Config toggles:
lambda_perceptual > 0 → VGG-16 perceptual loss (Phase 3.2+) lambda_perceptual > 0 → VGG-16 perceptual loss (Phase 3.2+)
lambda_adversarial > 0 → PatchGAN hinge adversarial loss (Phase 3.3) lambda_adversarial > 0 → PatchGAN hinge adversarial loss (Phase 3.3)
free_bits > 0 → per-dimension KL free bits (a floor on each dimension's
KL that discourages posterior collapse; it does not cap large KL)
Loss: L = L_mse + λ_perc·L_vgg + λ_adv·L_adv + β_kl·L_kl Loss: L = L_mse + λ_perc·L_vgg + λ_adv·L_adv + β_kl·L_kl
AMP is intentionally disabled for VAE training — mixed-precision float16
overflows when the KL divergence spikes, producing NaN cascades that
corrupt the model irrecoverably. All VAE + perceptual + PatchGAN
computation runs in float32.
""" """
device = torch.device(device if torch.cuda.is_available() else "cpu") device = torch.device(device if torch.cuda.is_available() else "cpu")
vae = vae.to(device) vae = vae.to(device)
@@ -425,6 +432,8 @@ def train_vae(
lambda_perceptual = cfg.get("lambda_perceptual", 0.0) lambda_perceptual = cfg.get("lambda_perceptual", 0.0)
lambda_adversarial = cfg.get("lambda_adversarial", 0.0) lambda_adversarial = cfg.get("lambda_adversarial", 0.0)
lr_d = cfg.get("lr_d", 1e-4) lr_d = cfg.get("lr_d", 1e-4)
free_bits_val = cfg.get("free_bits", 0.0)
grad_clip = cfg.get("grad_clip", 1.0)
ema_decay = cfg.get("ema_decay", 0.9999) ema_decay = cfg.get("ema_decay", 0.9999)
sample_interval = cfg.get("sample_interval", 10) sample_interval = cfg.get("sample_interval", 10)
fid_interval = cfg.get("fid_interval", 25) fid_interval = cfg.get("fid_interval", 25)
@@ -432,6 +441,7 @@ def train_vae(
use_perceptual = lambda_perceptual > 0 use_perceptual = lambda_perceptual > 0
use_adversarial = lambda_adversarial > 0 use_adversarial = lambda_adversarial > 0
use_free_bits = free_bits_val > 0
loader = DataLoader( loader = DataLoader(
train_dataset, batch_size=batch_size, shuffle=True, train_dataset, batch_size=batch_size, shuffle=True,
@@ -440,8 +450,8 @@ def train_vae(
) )
opt_vae = torch.optim.Adam(vae.parameters(), lr=lr) opt_vae = torch.optim.Adam(vae.parameters(), lr=lr)
use_amp = device.type == "cuda" # AMP disabled — float16 overflows on KL spikes, causing NaN cascades
scaler = _GradScaler("cuda", enabled=use_amp) use_amp = False
# KL warmup: linearly ramp beta_kl from 0 to target over first 20% of training # KL warmup: linearly ramp beta_kl from 0 to target over first 20% of training
kl_warmup_epochs = max(1, epochs // 5) kl_warmup_epochs = max(1, epochs // 5)
@@ -456,11 +466,10 @@ def train_vae(
perc_fn = None perc_fn = None
patchgan = None patchgan = None
opt_d = None opt_d = None
scaler_d = None
if use_perceptual: if use_perceptual:
from src.training.perceptual import PerceptualLoss from src.training.perceptual import PerceptualLoss
perc_fn = PerceptualLoss().to(device) perc_fn = PerceptualLoss().to(device).float()
print("Perceptual loss: VGG-16 relu1_2 + relu2_2 + relu3_3") print("Perceptual loss: VGG-16 relu1_2 + relu2_2 + relu3_3")
if use_adversarial: if use_adversarial:
@@ -468,15 +477,14 @@ def train_vae(
patchgan = PatchGANDiscriminator( patchgan = PatchGANDiscriminator(
ndf=cfg.get("ndf_patch", 64), ndf=cfg.get("ndf_patch", 64),
image_size=cfg.get("image_size", 64), image_size=cfg.get("image_size", 64),
).to(device) ).to(device).float()
opt_d = torch.optim.Adam(patchgan.parameters(), lr=lr_d, betas=(0.5, 0.999)) opt_d = torch.optim.Adam(patchgan.parameters(), lr=lr_d, betas=(0.5, 0.999))
scaler_d = _GradScaler("cuda", enabled=use_amp)
sched_d = torch.optim.lr_scheduler.LambdaLR( sched_d = torch.optim.lr_scheduler.LambdaLR(
opt_d, lr_lambda=lambda ep: max(0.0, 1.0 - max(ep - decay_start, 0) / max(epochs - decay_start, 1))) opt_d, lr_lambda=lambda ep: max(0.0, 1.0 - max(ep - decay_start, 0) / max(epochs - decay_start, 1)))
n_d = sum(p.numel() for p in patchgan.parameters()) n_d = sum(p.numel() for p in patchgan.parameters())
print(f"PatchGAN: {n_d:,} params") print(f"PatchGAN: {n_d:,} params")
else: else:
hinge_d_loss = hinge_g_loss = None # satisfy linter, never called hinge_d_loss = hinge_g_loss = None # never called
# ── Fixed seeds for consistent visualisation ────────────────────────── # ── Fixed seeds for consistent visualisation ──────────────────────────
fixed_z = torch.randn(16, latent_dim, device=device) fixed_z = torch.randn(16, latent_dim, device=device)
@@ -497,9 +505,11 @@ def train_vae(
"adv_g_loss": [], "adv_d_loss": [], "fid": {}, "adv_g_loss": [], "adv_d_loss": [], "fid": {},
} }
best_fid = float("inf") best_fid = float("inf")
nan_skipped = 0
print( print(
f"Device: {device} AMP: {use_amp} Batches/epoch: {len(loader)}" f"Device: {device} AMP: disabled (float32) Batches/epoch: {len(loader)}"
f" β_kl={beta_kl} (warmup {kl_warmup_epochs}ep) λ_perc={lambda_perceptual} λ_adv={lambda_adversarial}" f" β_kl={beta_kl} (warmup {kl_warmup_epochs}ep) λ_perc={lambda_perceptual}"
f" λ_adv={lambda_adversarial} free_bits={free_bits_val}"
) )
t_start = time.time() t_start = time.time()
@@ -513,43 +523,56 @@ def train_vae(
n_batches = 0 n_batches = 0
for real in tqdm(loader, desc=f"Epoch {epoch}/{epochs}", leave=False): for real in tqdm(loader, desc=f"Epoch {epoch}/{epochs}", leave=False):
real = real.to(device) real = real.to(device).float()
# KL warmup: ramp from 0 to beta_kl over kl_warmup_epochs # KL warmup: ramp from 0 to beta_kl over kl_warmup_epochs
current_beta = beta_kl * min(1.0, epoch / kl_warmup_epochs) current_beta = beta_kl * min(1.0, epoch / kl_warmup_epochs)
# ── VAE forward ─────────────────────────────────────────────── # ── VAE forward (float32, no AMP) ────────────────────────────
with _autocast("cuda", enabled=use_amp):
recon, mu, log_var = vae(real) recon, mu, log_var = vae(real)
mse = F.mse_loss(recon, real) mse = F.mse_loss(recon, real)
kl = -0.5 * (1 + log_var - mu.pow(2) - log_var.exp()).sum(1).mean()
# KL divergence with optional free bits
kl_per_dim = -0.5 * (1 + log_var - mu.pow(2) - log_var.exp()) # (B, latent_dim)
if use_free_bits:
# Free bits: ensure each dimension contributes at least free_bits_val KL.
# Dimensions below the threshold are raised to it, preventing posterior
# collapse (dimensions that go to 0) while still penalising large KL.
kl_per_dim = torch.clamp(kl_per_dim, min=free_bits_val)
kl = kl_per_dim.sum(1).mean()
perc = perc_fn(recon, real) if use_perceptual else real.new_zeros(1).squeeze() perc = perc_fn(recon, real) if use_perceptual else real.new_zeros(1).squeeze()
vae_loss = mse + current_beta * kl + lambda_perceptual * perc vae_loss = mse + current_beta * kl + lambda_perceptual * perc
# ── NaN/Inf guard ────────────────────────────────────────────
if not torch.isfinite(vae_loss):
nan_skipped += 1
opt_vae.zero_grad()
continue
# ── PatchGAN discriminator step ─────────────────────────────── # ── PatchGAN discriminator step ───────────────────────────────
adv_d = real.new_zeros(1).squeeze() adv_d = real.new_zeros(1).squeeze()
if use_adversarial: if use_adversarial:
opt_d.zero_grad() opt_d.zero_grad()
with _autocast("cuda", enabled=use_amp):
d_real = patchgan(real) d_real = patchgan(real)
d_fake = patchgan(recon.detach()) d_fake = patchgan(recon.detach())
adv_d = hinge_d_loss(d_real, d_fake) adv_d = hinge_d_loss(d_real, d_fake)
scaler_d.scale(adv_d).backward() if torch.isfinite(adv_d):
scaler_d.step(opt_d) adv_d.backward()
scaler_d.update() torch.nn.utils.clip_grad_norm_(patchgan.parameters(), grad_clip)
opt_d.step()
# ── PatchGAN generator adversarial loss ─────────────────────── # ── PatchGAN generator adversarial loss ───────────────────────
adv_g = real.new_zeros(1).squeeze() adv_g = real.new_zeros(1).squeeze()
if use_adversarial: if use_adversarial:
with _autocast("cuda", enabled=use_amp):
adv_g = hinge_g_loss(patchgan(recon)) adv_g = hinge_g_loss(patchgan(recon))
vae_loss = vae_loss + lambda_adversarial * adv_g vae_loss = vae_loss + lambda_adversarial * adv_g
# ── VAE backward ────────────────────────────────────────────── # ── VAE backward ──────────────────────────────────────────────
opt_vae.zero_grad() opt_vae.zero_grad()
scaler.scale(vae_loss).backward() vae_loss.backward()
scaler.step(opt_vae) torch.nn.utils.clip_grad_norm_(vae.parameters(), grad_clip)
scaler.update() opt_vae.step()
ema.update(vae) ema.update(vae)
recon_sum += mse.item() recon_sum += mse.item()
@@ -559,11 +582,11 @@ def train_vae(
adv_d_sum += adv_d.item() adv_d_sum += adv_d.item()
n_batches += 1 n_batches += 1
avg_r = recon_sum / n_batches avg_r = recon_sum / max(n_batches, 1)
avg_k = kl_sum / n_batches avg_k = kl_sum / max(n_batches, 1)
avg_p = perc_sum / n_batches avg_p = perc_sum / max(n_batches, 1)
avg_g = adv_g_sum / n_batches avg_g = adv_g_sum / max(n_batches, 1)
avg_d = adv_d_sum / n_batches avg_d = adv_d_sum / max(n_batches, 1)
history["recon_loss"].append(avg_r) history["recon_loss"].append(avg_r)
history["kl_loss"].append(avg_k) history["kl_loss"].append(avg_k)
history["perc_loss"].append(avg_p) history["perc_loss"].append(avg_p)
@@ -574,6 +597,7 @@ def train_vae(
f"[{epoch:03d}/{epochs}] " f"[{epoch:03d}/{epochs}] "
f"MSE: {avg_r:.4f} KL: {avg_k:.2f} β={current_beta:.6f} " f"MSE: {avg_r:.4f} KL: {avg_k:.2f} β={current_beta:.6f} "
f"Perc: {avg_p:.4f} AdvG: {avg_g:.4f} AdvD: {avg_d:.4f}" f"Perc: {avg_p:.4f} AdvG: {avg_g:.4f} AdvD: {avg_d:.4f}"
f" (NaN skipped: {nan_skipped})"
) )
if epoch % sample_interval == 0: if epoch % sample_interval == 0:
@@ -607,6 +631,7 @@ def train_vae(
if patchgan is not None: if patchgan is not None:
torch.save(patchgan.state_dict(), save_dir / f"{run_name}_final_patchgan.pt") torch.save(patchgan.state_dict(), save_dir / f"{run_name}_final_patchgan.pt")
history["train_time_s"] = time.time() - t_start history["train_time_s"] = time.time() - t_start
print(f"Total NaN-skipped batches: {nan_skipped}")
return history return history