# DRL_PROJ/pipeline/vast_api.py

import json
from dataclasses import dataclass
from typing import Any
from urllib import error, request


# Single exception type used to report Vast.ai API failures to callers
class VastApiError(RuntimeError):
    """Raised when a Vast.ai API call fails or returns an unusable response."""


# Lightweight view of a Vast.ai instance with the fields the pipeline cares about
@dataclass(slots=True)
class VastInstance:
    # Built by VastApiClient.show_instance from the instance payload. Apart from
    # `id`, every field is read with dict.get, so a key missing from the payload
    # shows up here as None (or "" for actual_status). Values are stored as the
    # API returned them; nothing here validates or converts their types.
    id: int  # instance ID; show_instance coerces it with int()
    actual_status: str  # "" when the payload carries no "actual_status"
    ssh_host: str | None
    ssh_port: int | None
    public_ipaddr: str | None
    gpu_name: str | None
    dph_total: float | None  # presumably the total price in dollars per hour — TODO confirm
    raw: dict[str, Any]  # the payload dict the fields above were read from


# Thin wrapper around the Vast.ai REST API
class VastApiClient:
    """Minimal Vast.ai REST client built on ``urllib``.

    Every request sends the API key as a bearer token. HTTP error statuses,
    transport failures, undecodable bodies and unexpectedly shaped responses
    are all reported as ``VastApiError`` so callers have one exception type
    to handle.
    """

    def __init__(self, api_key: str, *, base_url: str = "https://console.vast.ai") -> None:
        """Store the credentials and the API root.

        Args:
            api_key: Key sent in the ``Authorization: Bearer`` header.
            base_url: API root; trailing slashes are dropped because every
                request path already starts with ``/``.
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")

    # Low-level request helper — sends JSON, returns parsed response body
    def _request(self, method: str, path: str, payload: dict[str, Any] | None = None) -> Any:
        """Perform one HTTP call and decode the JSON response.

        Args:
            method: HTTP verb, e.g. ``"GET"``.
            path: Path appended to ``base_url``; expected to start with ``/``.
            payload: Optional JSON-serialisable request body.

        Returns:
            The decoded JSON document, or ``None`` when the body is empty or
            contains only whitespace.

        Raises:
            VastApiError: On an HTTP error status, a transport failure, or a
                response body that is not valid JSON.
        """
        url = f"{self.base_url}{path}"
        data = None
        headers = {"Authorization": f"Bearer {self.api_key}"}
        if payload is not None:
            headers["Content-Type"] = "application/json"
            data = json.dumps(payload).encode("utf-8")

        req = request.Request(url, method=method, data=data, headers=headers)
        try:
            with request.urlopen(req, timeout=60) as response:
                body = response.read()

        # Vast.ai returns varied error formats; surface whatever body we get
        except error.HTTPError as exc:
            details = exc.read().decode("utf-8", errors="replace")
            raise VastApiError(f"{method} {path} failed with {exc.code}: {details}") from exc
        except error.URLError as exc:
            raise VastApiError(f"{method} {path} failed: {exc.reason}") from exc
        except OSError as exc:
            # Failures while reading the body (e.g. a socket timeout) are raised
            # directly rather than wrapped in URLError.
            raise VastApiError(f"{method} {path} failed: {exc}") from exc

        if not body.strip():
            return None
        try:
            return json.loads(body)
        except ValueError as exc:
            # Covers JSONDecodeError and UnicodeDecodeError (e.g. an HTML error page).
            snippet = body[:200].decode("utf-8", errors="replace")
            raise VastApiError(f"{method} {path} returned invalid JSON: {snippet}") from exc

    # Fetch the currently authenticated user's profile
    def show_user(self) -> dict[str, Any]:
        """Return the decoded response of ``GET /api/v0/users/current/``."""
        return self._request("GET", "/api/v0/users/current/")

    # ── SSH keys ────────────────────────────────────────────────────────

    # List registered SSH keys; handles inconsistent response shapes from the API
    def show_ssh_keys(self) -> list[dict[str, Any]]:
        """Return the SSH key records registered for the account.

        Accepts either a bare list or a dict holding the list under ``"keys"``
        or ``"ssh_keys"``.

        Raises:
            VastApiError: If the response matches none of those shapes.
        """
        response = self._request("GET", "/api/v0/ssh/")
        if isinstance(response, list):
            return response
        if isinstance(response, dict):
            for key in ("keys", "ssh_keys"):
                value = response.get(key)
                if isinstance(value, list):
                    return value
        raise VastApiError(f"Unexpected SSH key response: {response}")

    # Register the public key if it isn't already present
    def ensure_ssh_key(self, public_key: str) -> None:
        """Register ``public_key`` unless an identical key already exists.

        Surrounding whitespace is ignored on both sides of the comparison, so
        a key read from a ``.pub`` file (with its trailing newline) matches
        its registered copy instead of being registered again on every call.
        """
        wanted = public_key.strip()
        for item in self.show_ssh_keys():
            if not isinstance(item, dict):
                continue
            # The key text may sit under any of these field names.
            registered = item.get("key") or item.get("public_key") or item.get("ssh_key") or ""
            if isinstance(registered, str) and registered.strip() == wanted:
                return
        self._request("POST", "/api/v0/ssh/", {"ssh_key": wanted})

    # Authorise an SSH key for a running instance
    def attach_ssh_key(self, instance_id: int, public_key: str) -> None:
        """Attach ``public_key`` (surrounding whitespace removed) to the instance."""
        self._request(
            "POST",
            f"/api/v0/instances/{instance_id}/ssh/",
            {"ssh_key": public_key.strip()},
        )

    # ── Offers ─────────────────────────────────────────────────────────

    # Search available GPU offers matching a query filter
    def search_offers(self, query: dict[str, Any]) -> list[dict[str, Any]]:
        """Return the offers matching ``query``.

        A single offer returned as a dict is wrapped in a list, and a missing
        or null ``"offers"`` field yields an empty list.

        Raises:
            VastApiError: If the response is not a JSON object or ``"offers"``
                is neither a list, a dict nor null.
        """
        response = self._request("POST", "/api/v0/bundles/", query)
        if not isinstance(response, dict):
            raise VastApiError(f"Unexpected offer search response: {response}")
        offers = response.get("offers")
        if offers is None:
            return []
        if isinstance(offers, dict):
            return [offers]
        if not isinstance(offers, list):
            raise VastApiError(f"Unexpected offer search response: {response}")
        return offers

    # ── Instances ──────────────────────────────────────────────────────

    # Rent an offer, returning the new contract (instance) ID
    def create_instance(self, offer_id: int, payload: dict[str, Any]) -> int:
        """Accept offer ``offer_id`` and return the ID in ``"new_contract"``.

        Raises:
            VastApiError: If the response does not report success or carries
                no usable ``"new_contract"`` value.
        """
        response = self._request("PUT", f"/api/v0/asks/{offer_id}/", payload)
        if not isinstance(response, dict) or not response.get("success"):
            raise VastApiError(f"Instance creation failed for offer {offer_id}: {response}")
        try:
            return int(response["new_contract"])
        except (KeyError, TypeError, ValueError) as exc:
            raise VastApiError(
                f"Instance creation for offer {offer_id} returned no contract ID: {response}"
            ) from exc

    # Fetch current status and connection details for an instance
    def show_instance(self, instance_id: int) -> VastInstance:
        """Return a ``VastInstance`` built from the ``"instances"`` payload.

        Raises:
            VastApiError: If the response has no instance details, the details
                are not a JSON object, or they lack a usable ``"id"``.
        """
        response = self._request("GET", f"/api/v0/instances/{instance_id}/")
        raw = response.get("instances") if isinstance(response, dict) else None
        if not raw:
            raise VastApiError(f"No instance details found for {instance_id}: {response}")
        if not isinstance(raw, dict):
            raise VastApiError(f"Unexpected instance details for {instance_id}: {response}")
        try:
            found_id = int(raw["id"])
        except (KeyError, TypeError, ValueError) as exc:
            raise VastApiError(
                f"Instance details for {instance_id} have no usable id: {response}"
            ) from exc
        return VastInstance(
            id=found_id,
            actual_status=raw.get("actual_status", ""),
            ssh_host=raw.get("ssh_host"),
            ssh_port=raw.get("ssh_port"),
            public_ipaddr=raw.get("public_ipaddr"),
            gpu_name=raw.get("gpu_name"),
            dph_total=raw.get("dph_total"),
            raw=raw,
        )

    # Permanently destroy an instance (releases the GPU and billing)
    def destroy_instance(self, instance_id: int) -> None:
        """Send ``DELETE`` for the instance; the response body is ignored."""
        self._request("DELETE", f"/api/v0/instances/{instance_id}/")