""" Download the DeepFakeFace dataset from HuggingFace and extract it. Usage: python tools/download_data.py python tools/download_data.py --data-dir /mnt/data/DFF """ import argparse import zipfile from pathlib import Path from huggingface_hub import snapshot_download SOURCES = ["wiki", "inpainting", "text2img", "insight"] def download(data_dir: Path) -> None: print(f"Downloading dataset from HuggingFace into {data_dir}...") snapshot_download( repo_id="OpenRL/DeepFakeFace", repo_type="dataset", local_dir=data_dir, ) for source in SOURCES: zip_path = data_dir / f"{source}.zip" target_dir = data_dir / source if target_dir.exists(): print(f" {source}/ already extracted, skipping") continue if not zip_path.exists(): print(f" WARNING: {zip_path} not found, skipping") continue print(f" Extracting {zip_path.name}...") with zipfile.ZipFile(zip_path, "r") as z: z.extractall(data_dir) print(f" Done -> {target_dir}") print("\nVerifying...") for source in SOURCES: d = data_dir / source count = sum(1 for _ in d.rglob("*.jpg")) if d.exists() else 0 print(f" {source}: {count} images") if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--data-dir", default="data", help="Directory to download into. Default: data", ) args = parser.parse_args() download(Path(args.data_dir))