odysseus/scripts/import_from_vllm_recipes.py

#!/usr/bin/env python3
"""Import models from the upstream vllm-project/recipes catalog into our
local hf_models.json. Two modes:

  --update-existing  Stamp min_vllm_version + vllm_recipe=True on rows we
                     already carry. Cheap, no HF API calls.
  --add-missing      Create new catalog rows for every recipe model we
                     don't carry. Hits the HF API for created_at + downloads
                     (~1 req per missing model, paced).

Both modes write atomically (tmp + rename) so a crashed run leaves the
catalog intact. Default with no mode flags runs both, prefer to pass them
explicitly.

Usage:
    python scripts/import_from_vllm_recipes.py --update-existing
    python scripts/import_from_vllm_recipes.py --add-missing
    python scripts/import_from_vllm_recipes.py --dry-run
    python scripts/import_from_vllm_recipes.py --limit 10

Auth: set HF_TOKEN to access gated repos when --add-missing.
"""
import argparse
import json
import os
import re
import sys
import time
from datetime import datetime
from pathlib import Path

try:
    import httpx
    import yaml
except ImportError:
    print("pip install httpx PyYAML", file=sys.stderr)
    sys.exit(1)

try:
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError
except ImportError:
    HfApi = None
    HfHubHTTPError = Exception


CATALOG_PATH = Path(__file__).resolve().parent.parent / "services" / "hwfit" / "data" / "hf_models.json"
RECIPES_TREE_URL = (
    "https://api.github.com/repos/vllm-project/recipes/git/trees/main?recursive=1"
)
RECIPE_RAW_URL = (
    "https://raw.githubusercontent.com/vllm-project/recipes/main/models/{repo}.yaml"
)


# Map recipe `precision` to the closest catalog `quantization` label that
# fit.py / models.py already understand.
_PRECISION_TO_QUANT = {
    "fp8": "FP8",
    "nvfp4": "NVFP4",
    "mxfp4": "MXFP4",
    "bf16": "BF16",
    "fp16": "F16",
    "f16": "F16",
    "fp4": "FP4",
    "int8": "INT8",
    "int4": "INT4",
    "awq-4bit": "AWQ-4bit",
    "awq-8bit": "AWQ-8bit",
}

# Architecture name → use_case fallback. fit.py weights use_case for filtering;
# missing field defaults to a generic bucket.
_ARCH_USE_CASE = {
    "moe": "General-purpose reasoning, long-context",
    "llama": "General-purpose chat",
    "qwen2": "General-purpose chat",
    "qwen3": "General-purpose reasoning",
    "deepseek_v3_moe": "General-purpose reasoning, long-context",
    "deepseek_v4_moe": "General-purpose reasoning, long-context",
}


def _parse_param_count(s) -> int:
    """'230B' / '8.6B' / '4.2T' → integer parameter count."""
    if s is None:
        return 0
    s = str(s).strip().replace(",", "")
    m = re.match(r"^([\d.]+)\s*([KMBT]?)$", s, re.I)
    if not m:
        return 0
    num = float(m.group(1))
    unit = (m.group(2) or "").upper()
    mult = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12, "": 1.0}[unit]
    return int(num * mult)


def _capabilities_for(arch: str, hardware: dict, ctx_len: int, has_reasoning: bool) -> list[str]:
    caps = []
    if "moe" in (arch or "").lower():
        caps.append("moe")
    if has_reasoning:
        caps.append("reasoning")
    if ctx_len and ctx_len >= 100_000:
        caps.append("long_context")
    if any(hw in (hardware or {}) for hw in ("mi300x", "mi325x", "mi350x", "mi355x")):
        caps.append("amd_supported")
    return caps


def _fetch_manifest(client: httpx.Client) -> set[str]:
    r = client.get(RECIPES_TREE_URL, headers={"Accept": "application/vnd.github+json"}, timeout=15)
    r.raise_for_status()
    tree = (r.json() or {}).get("tree") or []
    out: set[str] = set()
    for e in tree:
        path = (e or {}).get("path") or ""
        if path.startswith("models/") and path.endswith(".yaml"):
            body = path[len("models/"):-len(".yaml")]
            if "/" in body:
                out.add(body)
    return out


def _fetch_recipe(client: httpx.Client, repo: str) -> dict | None:
    url = RECIPE_RAW_URL.format(repo=repo)
    try:
        r = client.get(url, timeout=10)
        if r.status_code != 200:
            return None
        return yaml.safe_load(r.text) or {}
    except Exception:
        return None


def _stamp_from_recipe(entry: dict, recipe: dict) -> bool:
    """Mutate entry with recipe-derived fields. Returns True if anything changed."""
    model = recipe.get("model") or {}
    meta = recipe.get("meta") or {}
    features = recipe.get("features") or {}

    changed = False
    new_min = (model.get("min_vllm_version") or "").strip()
    if new_min and entry.get("min_vllm_version") != new_min:
        entry["min_vllm_version"] = new_min
        changed = True
    if not entry.get("vllm_recipe"):
        entry["vllm_recipe"] = True
        changed = True
    # Hardware support map — useful for filtering "which models run on my AMD box".
    hw = meta.get("hardware") or {}
    if hw and entry.get("recipe_hardware") != hw:
        entry["recipe_hardware"] = {k: str(v) for k, v in hw.items()}
        changed = True
    # Tool/reasoning parser hints — purely informational at catalog level;
    # the live launch command builder still reads them from the recipe API.
    if features.get("reasoning") and not entry.get("has_reasoning_parser"):
        entry["has_reasoning_parser"] = True
        changed = True
    if features.get("tool_calling") and not entry.get("has_tool_call_parser"):
        entry["has_tool_call_parser"] = True
        changed = True
    return changed


def _build_new_entry(repo: str, recipe: dict, hf_info=None) -> dict | None:
    """Build a fresh catalog entry from a recipe + (optional) HF model info."""
    model = recipe.get("model") or {}
    meta = recipe.get("meta") or {}
    features = recipe.get("features") or {}
    variants = recipe.get("variants") or {}

    org, name = repo.split("/", 1)
    raw_params = _parse_param_count(model.get("parameter_count"))
    active_raw = _parse_param_count(model.get("active_parameters"))
    ctx = model.get("context_length") or 0

    # Pick the smallest-VRAM variant as the catalog quant — that's what most
    # users land on first. NVFP4/MXFP4 typically win this on Blackwell;
    # FP8 elsewhere; BF16 baseline only.
    pick_quant = None
    pick_vram = None
    for vk, vv in variants.items():
        if not isinstance(vv, dict):
            continue
        prec = (vv.get("precision") or "").lower()
        vram = vv.get("vram_minimum_gb") or 0
        quant = _PRECISION_TO_QUANT.get(prec)
        if quant and (pick_vram is None or (vram and vram < pick_vram)):
            pick_quant = quant
            pick_vram = vram or pick_vram
    if not pick_quant:
        pick_quant = "BF16"

    arch = (model.get("architecture") or "").lower()
    use_case = _ARCH_USE_CASE.get(arch, "General-purpose chat")
    caps = _capabilities_for(arch, meta.get("hardware") or {}, ctx, bool(features.get("reasoning")))

    rel_date = ""
    downloads = 0
    likes = 0
    if hf_info is not None:
        created = getattr(hf_info, "created_at", None)
        if created:
            rel_date = created.strftime("%Y-%m-%d")
        downloads = int(getattr(hf_info, "downloads", 0) or 0)
        likes = int(getattr(hf_info, "likes", 0) or 0)
    if not rel_date:
        rel_date = str(meta.get("date_updated") or datetime.utcnow().strftime("%Y-%m-%d"))

    entry: dict = {
        "name": repo,
        "provider": org,
        "parameter_count": str(model.get("parameter_count") or "?"),
        "parameters_raw": raw_params,
        "is_moe": "moe" in arch,
        "quantization": pick_quant,
        "context_length": int(ctx or 0),
        "use_case": use_case,
        "capabilities": caps,
        "pipeline_tag": "text-generation",
        "architecture": arch or "unknown",
        "hf_downloads": downloads,
        "hf_likes": likes,
        "release_date": rel_date,
        # Recipe-derived bits.
        "vllm_recipe": True,
        "min_vllm_version": (model.get("min_vllm_version") or "").strip() or None,
        "recipe_hardware": {k: str(v) for k, v in (meta.get("hardware") or {}).items()},
        "has_reasoning_parser": bool(features.get("reasoning")),
        "has_tool_call_parser": bool(features.get("tool_calling")),
    }
    if active_raw:
        entry["active_parameters"] = active_raw
    if pick_vram:
        # min_vram_gb is what hwfit uses for "does this fit". Recipe states a
        # minimum for the chosen variant; round up slightly for KV-cache room.
        entry["min_vram_gb"] = float(pick_vram)
        entry["min_ram_gb"] = float(round(pick_vram * 0.6, 1))
        entry["recommended_ram_gb"] = float(round(pick_vram * 1.2, 1))
    # Drop empty / None fields to keep the JSON tidy.
    return {k: v for k, v in entry.items() if v not in (None, "", [], {})}


def main():
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--update-existing", action="store_true", help="Stamp min_vllm_version + vllm_recipe on existing rows.")
    p.add_argument("--add-missing", action="store_true", help="Add new rows for recipe models not in the catalog.")
    p.add_argument("--limit", type=int, default=0, help="Stop after N recipe fetches.")
    p.add_argument("--dry-run", action="store_true", help="Don't write back; just report.")
    p.add_argument("--sleep", type=float, default=0.05, help="Seconds between HTTP requests.")
    args = p.parse_args()
    if not args.update_existing and not args.add_missing:
        args.update_existing = args.add_missing = True

    with CATALOG_PATH.open(encoding="utf-8") as f:
        catalog = json.load(f)
    by_name = {m.get("name"): m for m in catalog if m.get("name")}

    client = httpx.Client(follow_redirects=True)
    print(f"Catalog: {CATALOG_PATH} ({len(catalog)} entries)")
    print("Fetching upstream manifest…")
    try:
        manifest = _fetch_manifest(client)
    except Exception as e:
        print(f"FATAL: manifest fetch failed: {e}", file=sys.stderr)
        sys.exit(2)
    print(f"Manifest: {len(manifest)} recipes")

    existing = sorted(by_name.keys() & manifest)
    missing = sorted(manifest - by_name.keys())
    print(f"Match catalog ↔ manifest: existing={len(existing)} missing={len(missing)}")

    targets: list[tuple[str, str]] = []  # (repo, action)
    if args.update_existing:
        targets.extend((r, "update") for r in existing)
    if args.add_missing:
        targets.extend((r, "add") for r in missing)
    if args.limit:
        targets = targets[: args.limit]
    print(f"Targets: {len(targets)}")

    hf_api = HfApi(token=os.environ.get("HF_TOKEN") or None) if HfApi else None
    updated = added = skipped = 0
    started = time.time()

    for n, (repo, action) in enumerate(targets, 1):
        recipe = _fetch_recipe(client, repo)
        if not recipe:
            print(f"[{n}/{len(targets)}] {repo:55} skip (no recipe fetched)")
            skipped += 1
            time.sleep(args.sleep)
            continue
        if action == "update":
            entry = by_name[repo]
            if _stamp_from_recipe(entry, recipe):
                updated += 1
                print(f"[{n}/{len(targets)}] {repo:55} updated")
            else:
                print(f"[{n}/{len(targets)}] {repo:55} unchanged")
        else:  # add
            hf_info = None
            if hf_api:
                try:
                    hf_info = hf_api.model_info(repo, files_metadata=False)
                except HfHubHTTPError as e:
                    code = getattr(getattr(e, "response", None), "status_code", "?")
                    print(f"  HF {code} for {repo} — building from recipe only", file=sys.stderr)
                except Exception as e:
                    print(f"  HF error for {repo}: {e}", file=sys.stderr)
            new_entry = _build_new_entry(repo, recipe, hf_info)
            if new_entry:
                catalog.append(new_entry)
                by_name[repo] = new_entry
                added += 1
                print(f"[{n}/{len(targets)}] {repo:55} added ({new_entry.get('parameter_count','?')}, {new_entry.get('quantization','?')})")
            else:
                skipped += 1
                print(f"[{n}/{len(targets)}] {repo:55} skip (couldn't build entry)")
        time.sleep(args.sleep)

    elapsed = time.time() - started
    print()
    print(f"Done in {elapsed:.1f}s — added={added}, updated={updated}, skipped={skipped}")

    if args.dry_run:
        print("Dry run — no write.")
        return
    if added or updated:
        tmp = CATALOG_PATH.with_suffix(".json.tmp")
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(catalog, f, indent=1, ensure_ascii=False)
            f.write("\n")
        tmp.replace(CATALOG_PATH)
        print(f"Wrote {CATALOG_PATH} ({len(catalog)} entries)")
    else:
        print("No changes — catalog untouched.")


if __name__ == "__main__":
    main()