Odysseus v1.0

2026-06-17 02:05:22 -04:00 · 2026-05-31 23:58:26 +09:00
commit e5c99a5eee
421 changed files with 271349 additions and 0 deletions
@@ -0,0 +1,463 @@
+import re
+
+from services.hwfit.models import (
+    params_b, estimate_memory_gb, infer_use_case,
+    get_models, is_prequantized, _active_params_b, QUANT_BYTES_PER_PARAM,
+    QUANT_SPEED_MULT, QUANT_QUALITY_PENALTY,
+)
+
+# Memory-bandwidth figure per GPU model, keyed by a lowercase fragment of the
+# product name. _lookup_bandwidth() substring-matches these keys against the
+# detected GPU name, and _estimate_speed() divides the value by the model size
+# in GB to get tok/s — so the values are presumably GB/s (TODO confirm units).
+GPU_BANDWIDTH = {
+    "5090": 1792, "5080": 960, "5070 ti": 896, "5070": 672, "5060 ti": 448, "5060": 256,
+    "4090": 1008, "4080 super": 736, "4080": 717, "4070 ti super": 672, "4070 ti": 504, "4070 super": 504, "4070": 504, "4060 ti": 288, "4060": 272,
+    "3090 ti": 1008, "3090": 936, "3080 ti": 912, "3080": 760, "3070 ti": 608, "3070": 448, "3060 ti": 448, "3060": 360,
+    "2080 ti": 616, "2080 super": 496, "2080": 448, "2070 super": 448, "2070": 448, "2060 super": 448, "2060": 336,
+    "1660 ti": 288, "1660 super": 336, "1660": 192, "1650 super": 192, "1650": 128,
+    "h100 sxm": 3350, "h100": 2039, "h200": 4800, "a100 sxm": 2039, "a100": 1555,
+    "l40s": 864, "l40": 864, "l4": 300, "a10g": 600, "a10": 600, "t4": 320,
+    "v100 sxm": 900, "v100": 897, "a6000": 768, "a5000": 768, "a4000": 448,
+    "7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
+    "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
+    "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
+    "9070 xt": 624, "9070": 488,
+}
+
+# Pre-sort keys by length descending for correct substring matching: the most
+# specific key must be tried first (e.g. "4070 ti super" before "4070 ti"
+# before "4070", and "l40s" before "l4").
+_BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True)
+
+# Per-backend constant for the fallback speed estimate in _estimate_speed(),
+# used when no bandwidth figure applies: tok/s = K / active_params_b * quant
+# speed multiplier. Unknown backends fall back to 70 there.
+FALLBACK_K = {"cuda": 220, "rocm": 180, "cpu_x86": 70, "cpu_arm": 90}
+
+# Weights for the composite score, per use case, in the order
+# (quality, speed, fit, context) — the order analyze_model() unpacks them in.
+# Each row sums to 1.0.
+USE_CASE_WEIGHTS = {
+    "general":    (0.45, 0.30, 0.15, 0.10),
+    "coding":     (0.50, 0.20, 0.15, 0.15),
+    "reasoning":  (0.55, 0.15, 0.15, 0.15),
+    "chat":       (0.40, 0.35, 0.15, 0.10),
+    "multimodal": (0.50, 0.20, 0.15, 0.15),
+    "embedding":  (0.30, 0.40, 0.20, 0.10),
+    "tts":        (0.40, 0.35, 0.15, 0.10),
+    "stt":        (0.40, 0.35, 0.15, 0.10),
+}
+
+# Throughput (tok/s) at which _speed_score() awards the full 100 points.
+SPEED_TARGET = {
+    "general": 40, "coding": 40, "multimodal": 40, "chat": 40,
+    "reasoning": 25, "embedding": 200, "tts": 40, "stt": 40,
+}
+
+# Context length at which _context_score() awards the full 100 points; half of
+# it still earns 70.
+CONTEXT_TARGET = {
+    "general": 4096, "chat": 4096, "coding": 8192,
+    "reasoning": 8192, "multimodal": 4096, "embedding": 512,
+    "tts": 2048, "stt": 2048,
+}
+
+
+def _lookup_bandwidth(gpu_name):
+    if not gpu_name:
+        return None
+    gn = gpu_name.lower()
+    for key in _BW_KEYS_SORTED:
+        if key in gn:
+            return GPU_BANDWIDTH[key]
+    return None
+
+
+def _estimate_speed(model, quant, run_mode, system):
+    """Estimate tok/s. Uses active params for MoE (only active experts run per token)."""
+    pb = _active_params_b(model)
+    is_moe = model.get("is_moe", False)
+    bw = _lookup_bandwidth(system.get("gpu_name"))
+    backend = system.get("backend", "cpu_x86")
+
+    if bw and run_mode in ("gpu", "cpu_offload"):
+        bpp = QUANT_BYTES_PER_PARAM.get(quant, 0.5)
+        model_gb = pb * bpp
+        if model_gb <= 0:
+            return 0.0
+        efficiency = 0.55
+        raw_tps = (bw / model_gb) * efficiency
+        if run_mode == "cpu_offload":
+            mode_factor = 0.5
+        elif is_moe:
+            mode_factor = 0.8
+        else:
+            mode_factor = 1.0
+        return raw_tps * mode_factor
+
+    k = FALLBACK_K.get(backend, 70)
+    if pb <= 0:
+        return 0.0
+    sm = QUANT_SPEED_MULT.get(quant, 1.0)
+    return k / pb * sm
+
+
+def _quality_score(model, quant, use_case):
+    pb = params_b(model)
+    if pb < 1:
+        base = 30
+    elif pb < 3:
+        base = 45
+    elif pb < 7:
+        base = 60
+    elif pb < 10:
+        base = 75
+    elif pb < 20:
+        base = 82
+    elif pb < 40:
+        base = 89
+    else:
+        base = 95
+
+    name_lower = model.get("name", "").lower()
+    if "qwen" in name_lower:
+        base += 2
+    if "deepseek" in name_lower:
+        base += 3
+    if "llama" in name_lower:
+        base += 2
+    if "mistral" in name_lower or "mixtral" in name_lower:
+        base += 1
+    if "gemma" in name_lower:
+        base += 1
+
+    base += QUANT_QUALITY_PENALTY.get(quant, 0)
+
+    model_uc = infer_use_case(model)
+    if model_uc == "coding" and use_case == "coding":
+        base += 6
+    if model_uc == "reasoning" and use_case == "reasoning" and pb >= 13:
+        base += 5
+    if model_uc == "multimodal" and use_case == "multimodal":
+        base += 6
+
+    return max(0, min(100, base))
+
+
+def _speed_score(tps, use_case):
+    target = SPEED_TARGET.get(use_case, 40)
+    return max(0, min(100, (tps / target) * 100))
+
+
+def _fit_score(required, available):
+    if required > available:
+        return 0
+    if available <= 0:
+        return 0
+    ratio = required / available
+    if ratio <= 0.5:
+        return 60 + (ratio / 0.5) * 40
+    if ratio <= 0.8:
+        return 100
+    if ratio <= 0.9:
+        return 70
+    return 50
+
+
+def _context_score(ctx, use_case):
+    target = CONTEXT_TARGET.get(use_case, 4096)
+    if ctx >= target:
+        return 100
+    if ctx >= target / 2:
+        return 70
+    return 30
+
+
+def _try_quant_at(model, quant, ctx, gpu_vram, available_ram):
+    """Try a specific quant at a given context. Returns (run_mode, quant, ctx, mem) or None."""
+    mem = estimate_memory_gb(model, quant, ctx)
+    if gpu_vram > 0 and mem <= gpu_vram:
+        return "gpu", quant, ctx, mem
+    if gpu_vram > 0 and mem <= available_ram:
+        return "cpu_offload", quant, ctx, mem
+    if gpu_vram <= 0 and mem <= available_ram:
+        return "cpu_only", quant, ctx, mem
+    # Try halving context
+    cur_ctx = ctx // 2
+    while cur_ctx >= 1024:
+        mem = estimate_memory_gb(model, quant, cur_ctx)
+        if gpu_vram > 0 and mem <= gpu_vram:
+            return "gpu", quant, cur_ctx, mem
+        if mem <= available_ram:
+            return ("cpu_offload" if gpu_vram > 0 else "cpu_only"), quant, cur_ctx, mem
+        cur_ctx //= 2
+    return None
+
+
+def _quant_bits(q):
+    """Approximate bit-width of a quant label so GGUF quant tiers (Q4/Q8/…) can
+    be matched against prequantized formats (AWQ 4, AWQ-8bit, FP8, GPTQ-4bit…).
+    Returns 0 when unknown (caller treats unknown as "don't filter")."""
+    qu = (q or "").upper().replace("-", "").replace("_", "").replace(" ", "")
+    # GGUF k-quants + float formats
+    if qu.startswith("Q8") or "FP8" in qu:
+        return 8
+    if qu.startswith("Q4") or qu.startswith("IQ4"):
+        return 4
+    if qu.startswith("Q2") or qu.startswith("IQ2"):
+        return 2
+    if qu.startswith("Q3") or qu.startswith("IQ3"):
+        return 3
+    if qu.startswith("Q5"):
+        return 5
+    if qu.startswith("Q6"):
+        return 6
+    if qu.startswith("F16") or qu.startswith("BF16") or qu.startswith("F32"):
+        return 16
+    # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 …)
+    m = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", qu) or re.search(r"(\d{1,2})BIT", qu)
+    if m:
+        b = int(m.group(1))
+        if 2 <= b <= 16:
+            return b
+    return 0
+
+
+def analyze_model(model, system, target_quant=None):
+    pb = params_b(model)
+    if pb <= 0:
+        return None
+
+    use_case = infer_use_case(model)
+    has_gpu = system.get("has_gpu", False)
+    gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
+    gpu_count = system.get("gpu_count", 1) or 1
+    single_gpu_vram = gpu_vram / gpu_count if gpu_count > 1 else gpu_vram
+    available_ram = system.get("available_ram_gb", 0)
+    # When the user has explicitly picked a GPU config (not RAM mode), they want
+    # to see what runs ON the GPU(s) — not big models that only "fit" by spilling
+    # most layers to system RAM. Zeroing the offload budget makes _try_quant_at
+    # take only its GPU branches (fit on VRAM, shrinking context if needed),
+    # otherwise return None. Fixes "96 GB GPU still lists a 175 GB model".
+    gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0
+    eff_ram = 0 if gpu_only else available_ram
+    is_moe = model.get("is_moe", False)
+    ctx = model.get("context_length", 4096) or 4096
+
+    native_quant = model.get("quantization", "Q4_K_M")
+    preq = is_prequantized(model)
+
+    # GGUF models can't be sharded across GPUs — use single GPU VRAM
+    is_gguf = bool(model.get("gguf_sources"))
+    quant_upper = (native_quant or "").upper()
+    is_gguf_quant = any(quant_upper.startswith(p) for p in ("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ", "F16", "F32"))
+    # Single-GPU VRAM only applies to GGUF/dense builds (llama.cpp can't shard
+    # across GPUs). Prequantized formats (AWQ/GPTQ/FP8) are served sharded by
+    # vLLM across all GPUs, so they get the FULL multi-GPU VRAM — even when the
+    # model also lists a GGUF alternate download (gguf_sources).
+    if (is_gguf or is_gguf_quant) and not preq:
+        effective_vram = single_gpu_vram
+    else:
+        effective_vram = gpu_vram
+
+    # Determine which quant to evaluate at
+    if preq:
+        # AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
+        # specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose
+        # native bit-width matches — otherwise selecting Q8 would still surface
+        # AWQ-4bit models, mixing 4- and 8-bit in one view.
+        if target_quant:
+            _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
+            if _tb and _nb and _tb != _nb:
+                return None
+        quant_to_try = native_quant
+    elif target_quant:
+        # User picked a specific quant
+        quant_to_try = target_quant
+    else:
+        # Default: Q4_K_M (user's stated preference)
+        quant_to_try = "Q4_K_M"
+
+    result = _try_quant_at(model, quant_to_try, ctx, effective_vram, eff_ram)
+
+    # If target quant doesn't fit and it's not pre-quantized, try lower quants
+    if result is None and not preq and target_quant:
+        from services.hwfit.models import QUANT_HIERARCHY
+        idx = QUANT_HIERARCHY.index(target_quant) if target_quant in QUANT_HIERARCHY else -1
+        for q in QUANT_HIERARCHY[idx + 1:]:
+            result = _try_quant_at(model, q, ctx, effective_vram, eff_ram)
+            if result:
+                break
+
+    if result is None:
+        # Model doesn't fit on the user's current hardware. Surface it
+        # anyway with a "too_tight" badge instead of silently dropping
+        # it — without this, editing the hardware config to try LARGER
+        # tiers never revealed the bigger models, because they were
+        # filtered out before the user could see what would fit. The
+        # client already knows how to render too_tight (red row).
+        oversized_required = estimate_memory_gb(model, quant_to_try, ctx)
+        return {
+            "name": model.get("name"),
+            "provider": model.get("provider"),
+            "parameter_count": model.get("parameter_count"),
+            "params_b": round(pb, 1),
+            "is_moe": is_moe,
+            "use_case": use_case,
+            "fit_level": "too_tight",
+            "run_mode": "no_fit",
+            "quant": quant_to_try,
+            "context": ctx,
+            "required_gb": round(oversized_required, 1),
+            "speed_tps": 0,
+            "score": 0,
+            "scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0},
+            "gguf_sources": model.get("gguf_sources", []),
+            "context_length": model.get("context_length", 4096),
+        }
+
+    run_mode, quant, fit_ctx, required_gb = result
+
+    # Determine fit level
+    budget = effective_vram if run_mode == "gpu" else available_ram
+    if required_gb > budget:
+        return None
+    if run_mode == "gpu":
+        rec = model.get("recommended_ram_gb") or required_gb
+        if rec <= gpu_vram:
+            fit_level = "perfect"
+        elif gpu_vram >= required_gb * 1.2:
+            fit_level = "good"
+        else:
+            fit_level = "marginal"
+    elif run_mode == "cpu_offload":
+        fit_level = "good" if available_ram >= required_gb * 1.2 else "marginal"
+    else:
+        fit_level = "marginal"
+
+    tps = _estimate_speed(model, quant, run_mode, system)
+
+    q_score = _quality_score(model, quant, use_case)
+    s_score = _speed_score(tps, use_case)
+    f_score = _fit_score(required_gb, budget)
+    c_score = _context_score(fit_ctx, use_case)
+
+    wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10))
+    composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc
+
+    return {
+        "name": model.get("name"),
+        "provider": model.get("provider"),
+        "parameter_count": model.get("parameter_count"),
+        "params_b": round(pb, 1),
+        "is_moe": is_moe,
+        "use_case": use_case,
+        "fit_level": fit_level,
+        "run_mode": run_mode,
+        "quant": quant,
+        "context": fit_ctx,
+        "required_gb": round(required_gb, 1),
+        "speed_tps": round(tps, 1),
+        "score": round(composite, 1),
+        "scores": {
+            "quality": round(q_score, 1),
+            "speed": round(s_score, 1),
+            "fit": round(f_score, 1),
+            "context": round(c_score, 1),
+        },
+        "gguf_sources": model.get("gguf_sources", []),
+        "context_length": model.get("context_length", 4096),
+    }
+
+
+# Sort-column name -> key function over a result row (the dicts built by
+# analyze_model / rank_models). rank_models() falls back to "score" for an
+# unknown column and sorts every column descending except "vram".
+SORT_KEYS = {
+    "score": lambda r: r["score"],
+    "speed": lambda r: r["speed_tps"],
+    "vram": lambda r: r["required_gb"],
+    "params": lambda r: r["params_b"],
+    "context": lambda r: r["context"],
+}
+
+
+def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None):
+    """Rank all models against detected hardware. Returns sorted list of fit results."""
+    models = get_models()
+    results = []
+
+    # Include image gen models only when explicitly filtered
+    if use_case == "image_gen":
+        try:
+            from services.hwfit.image_models import rank_image_models
+        except ImportError:
+            rank_image_models = None
+        if rank_image_models:
+            img_results = rank_image_models(system, search=search)
+        else:
+            img_results = []
+        for im in img_results:
+            fit_map = {"perfect": "perfect", "good": "good", "tight": "marginal", "no_fit": "too_tight", "no_gpu": "too_tight"}
+            results.append({
+                "name": im["id"],
+                "provider": im["provider"],
+                "parameter_count": f"{im['params_b']}B",
+                "params_b": im["params_b"],
+                "is_moe": False,
+                "use_case": "image_gen",
+                "fit_level": fit_map.get(im["fit"], "too_tight"),
+                "run_mode": "gpu" if im["fits"] else "no_fit",
+                "quant": im.get("quant", "BF16"),
+                "context": 0,
+                "context_length": 0,
+                "required_gb": round(im.get("vram_needed") or 0, 1),
+                "speed_tps": 0,
+                "score": float(im["score"]),
+                "scores": {"quality": float(im["quality"]), "speed": float(im["speed"]), "fit": 0, "context": 0},
+                "gguf_sources": [],
+                "is_image_gen": True,
+                "capabilities": im.get("capabilities", []),
+                "description": im.get("description", ""),
+            })
+        if use_case == "image_gen":
+            sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
+            results.sort(key=sort_fn, reverse=(sort != "vram"))
+            return results[:limit]
+
+    # If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models
+    filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8"))
+
+    # MLX-quantized models only run on Apple Silicon (Metal). Exclude them on
+    # every other backend (CUDA / ROCm / CPU) so Linux/Windows users don't see
+    # unrunnable suggestions.
+    system_backend = (system.get("backend") or "").lower()
+    apple_silicon = system_backend in ("mps", "metal", "apple")
+
+    for m in models:
+        native_q = m.get("quantization", "")
+
+        # Drop MLX models on non-Apple hardware
+        if not apple_silicon and native_q.startswith("mlx-"):
+            continue
+
+        # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
+        if filter_native:
+            if quant == "FP8" and native_q != "FP8":
+                continue
+            if quant.startswith("AWQ") and not native_q.startswith("AWQ"):
+                continue
+            if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
+                continue
+
+        if search:
+            name = m.get("name", "").lower()
+            provider = m.get("provider", "").lower()
+            if search.lower() not in name and search.lower() not in provider:
+                continue
+
+        result = analyze_model(m, system, target_quant=quant)
+        if result is None:
+            continue
+
+        if use_case:
+            model_uc = infer_use_case(m)
+            if use_case != model_uc and use_case != "general":
+                continue
+
+        results.append(result)
+
+    # Pick the visible SET by best fit (score) first, so it stays the same no
+    # matter which column the user sorts by — otherwise sorting by params would
+    # truncate to the N biggest models (huge ones that don't even fit) while
+    # sorting by vram showed the N smallest. Only AFTER choosing the set do we
+    # order it by the requested column.
+    results.sort(key=SORT_KEYS["score"], reverse=True)
+    results = results[:limit]
+    sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
+    # vram ascending (smallest first), everything else descending (biggest first)
+    results.sort(key=sort_fn, reverse=(sort != "vram"))
+    return results
@@ -0,0 +1,457 @@
+import os
+import platform
+import subprocess
+import time
+
+# Lifetime, in seconds, of a cached detect_system() result.
+CACHE_TTL = 1800  # 30 min — hardware rarely changes; use the Rescan button to force a re-probe
+
+
+# Probe target shared by the helpers below. detect_system() fills these in
+# before probing; _run(), _read_file() and friends check _remote_host to choose
+# between local execution and SSH.
+# NOTE(review): this is module-level state, so overlapping detect_system()
+# calls for different hosts would see each other's target — confirm callers
+# never run it concurrently.
+_remote_host = None  # set by detect_system(host=...)
+_remote_port = None  # set by detect_system(ssh_port=...)
+_remote_platform = None  # set by detect_system(platform=...): "windows", "linux", "termux"
+_last_gpu_error = None  # set by _detect_nvidia() when nvidia-smi errors (driver mismatch, etc.)
+
+
+def _run(cmd):
+    try:
+        if _remote_host:
+            # Run command on remote host via SSH
+            if isinstance(cmd, list):
+                cmd_str = " ".join(cmd)
+            else:
+                cmd_str = cmd
+            ssh_cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no"]
+            if _remote_port and _remote_port != "22":
+                ssh_cmd += ["-p", _remote_port]
+            ssh_cmd += [_remote_host, cmd_str]
+            r = subprocess.run(
+                ssh_cmd,
+                capture_output=True, text=True, timeout=15,
+            )
+        else:
+            r = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+        if r.returncode == 0:
+            return r.stdout.strip()
+    except Exception:
+        pass
+    return None
+
+
+def _group_gpus(gpus):
+    """Group identical GPUs by (name, rounded VRAM).
+
+    vLLM tensor-parallel only works across IDENTICAL GPUs, so a mixed box must
+    be split into homogeneous pools. Each group carries the device indices so a
+    serve command can pin CUDA_VISIBLE_DEVICES to exactly one pool. Biggest pool
+    (by total VRAM) first — that's the sensible auto-default serving target.
+    """
+    groups = {}
+    order = []
+    for g in gpus:
+        key = (g["name"], round(g["vram_gb"]))
+        if key not in groups:
+            groups[key] = {
+                "name": g["name"],
+                "vram_each": round(g["vram_gb"], 1),
+                "count": 0,
+                "indices": [],
+            }
+            order.append(key)
+        groups[key]["count"] += 1
+        groups[key]["indices"].append(g.get("index"))
+    out = []
+    for key in order:
+        grp = groups[key]
+        grp["vram_total"] = round(grp["vram_each"] * grp["count"], 1)
+        out.append(grp)
+    out.sort(key=lambda x: x["vram_total"], reverse=True)
+    return out
+
+
+def _detect_nvidia():
+    global _last_gpu_error
+    _last_gpu_error = None
+    out = _run(["nvidia-smi", "--query-gpu=memory.total,name", "--format=csv,noheader,nounits"])
+    # Remote fallback: a non-interactive SSH shell often has a minimal PATH
+    # that omits where nvidia-smi lives (/usr/bin, /usr/local/cuda/bin), so the
+    # first call silently returns nothing → "No GPU" on hosts that DO have GPUs.
+    # Retry through a login shell with the common CUDA bin dirs on PATH.
+    if not out and _remote_host:
+        out = _run(
+            "bash -lc 'export PATH=\"$PATH:/usr/bin:/usr/local/bin:/usr/local/cuda/bin\"; "
+            "nvidia-smi --query-gpu=memory.total,name --format=csv,noheader,nounits'"
+        )
+    # Last resort: call nvidia-smi by absolute path. Some hosts have a login
+    # shell that isn't bash (or a profile that errors), so the bash -lc retry
+    # above still comes back empty even though the binary is right there.
+    if not out and _remote_host:
+        for _p in ("/usr/bin/nvidia-smi", "/usr/local/bin/nvidia-smi", "/usr/local/cuda/bin/nvidia-smi"):
+            out = _run(f"{_p} --query-gpu=memory.total,name --format=csv,noheader,nounits")
+            if out:
+                break
+    if not out:
+        return None
+
+    # nvidia-smi present but unable to talk to the driver (e.g. it was updated
+    # without a reboot). It prints an error and no GPU rows — surface that as a
+    # driver error rather than the misleading "No GPU".
+    _low = out.lower()
+    if ("nvml" in _low or "driver/library version mismatch" in _low
+            or "couldn't communicate" in _low or "no devices were found" in _low
+            or "failed to initialize" in _low):
+        _last_gpu_error = out.strip().split("\n")[0][:140] or "NVIDIA driver error"
+        return None
+
+    gpus = []
+    # nvidia-smi lists GPUs in index order (0,1,2,...), so the row position is
+    # the CUDA device index we'd pass to CUDA_VISIBLE_DEVICES.
+    for idx, line in enumerate(out.strip().split("\n")):
+        parts = [p.strip() for p in line.split(",")]
+        if len(parts) >= 2:
+            try:
+                vram_mb = float(parts[0])
+                gpus.append({"index": idx, "name": parts[1], "vram_gb": vram_mb / 1024.0})
+            except ValueError:
+                continue
+
+    if not gpus:
+        return None
+    total_vram = sum(g["vram_gb"] for g in gpus)
+    groups = _group_gpus(gpus)
+    return {
+        "gpu_name": gpus[0]["name"],
+        "gpu_vram_gb": round(total_vram, 1),
+        "gpu_count": len(gpus),
+        "gpus": gpus,
+        "gpu_groups": groups,
+        "homogeneous": len(groups) <= 1,
+        "backend": "cuda",
+    }
+
+
+def _detect_amd():
+    """Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
+    and APUs / unified-memory SoCs like Strix Halo (which expose
+    mem_info_vis_vram_total instead, or only mem_info_gtt_total).
+
+    Walks the ``cardN`` entries under /sys/class/drm (locally or over SSH),
+    keeps those whose PCI vendor id is 0x1002, and reads their memory sizes
+    from sysfs.
+
+    Returns:
+        A dict with ``gpu_name`` (first card), ``gpu_vram_gb`` (sum over all
+        cards), ``gpu_count``, ``gpus``, ``gpu_groups``, ``homogeneous``,
+        ``backend="rocm"`` and ``unified_memory``; or None when no AMD card
+        with a non-zero memory size is found or any step raises.
+    """
+    def _read(path):
+        # Read one sysfs attribute, stripped; None when it cannot be read.
+        if _remote_host:
+            val = _run(["cat", path])
+            return val.strip() if val else None
+        try:
+            with open(path) as f:
+                return f.read().strip()
+        except Exception:
+            return None
+
+    def _list_drm_cards():
+        # Names like "card0"; connector entries such as "card0-DP-1" contain a
+        # dash and are filtered out.
+        if _remote_host:
+            out = _run(["ls", "/sys/class/drm"])
+            if not out:
+                return []
+            return [e for e in out.split() if e.startswith("card") and "-" not in e]
+        try:
+            return [e for e in os.listdir("/sys/class/drm") if e.startswith("card") and "-" not in e]
+        except Exception:
+            return []
+
+    try:
+        cards = []
+        is_apu = False
+        # NOTE(review): _cidx is the position in the directory listing, counted
+        # across ALL drm cards (non-AMD ones included) and in whatever order
+        # the listing returns — it may not match the device index a ROCm
+        # runtime uses. Confirm before pinning devices with it.
+        for _cidx, entry in enumerate(_list_drm_cards()):
+            base = f"/sys/class/drm/{entry}/device"
+            vendor = _read(f"{base}/vendor")
+            if vendor != "0x1002":
+                continue
+            # Discrete cards usually report real VRAM in mem_info_vram_total,
+            # while some AMD APUs / Docker views expose a tiny vram_total and
+            # the usable pool in vis_vram_total. Use the larger of those two;
+            # only fall back to GTT if neither VRAM field is available.
+            vram_raw = _read(f"{base}/mem_info_vram_total")
+            vis_raw = _read(f"{base}/mem_info_vis_vram_total")
+            gtt_raw = _read(f"{base}/mem_info_gtt_total")
+            # Missing or non-numeric attributes count as 0.
+            vram_val = int(vram_raw) if vram_raw and vram_raw.isdigit() else 0
+            vis_val = int(vis_raw) if vis_raw and vis_raw.isdigit() else 0
+            gtt_val = int(gtt_raw) if gtt_raw and gtt_raw.isdigit() else 0
+            vram_bytes = max(vram_val, vis_val)
+            if vram_bytes <= 0:
+                vram_bytes = gtt_val
+            # NOTE(review): this flags the box as an APU whenever the visible
+            # VRAM is at least the total VRAM. Discrete cards that expose their
+            # whole VRAM as CPU-visible would presumably match too — confirm
+            # the heuristic on such systems.
+            if vis_val and vis_val >= vram_val:
+                is_apu = True
+            if vram_bytes <= 0:
+                continue
+            name = _read(f"{base}/product_name") or f"AMD GPU ({entry})"
+            cards.append({"index": _cidx, "name": name, "vram_gb": vram_bytes / (1024**3)})
+
+        if not cards:
+            return None
+        total_vram = sum(c["vram_gb"] for c in cards)
+        groups = _group_gpus(cards)
+        # NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
+        # is the real usable GPU memory — it's physically backed but reserved
+        # by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
+        # RAM: the two pools are separate from the OS's perspective.
+        return {
+            "gpu_name": cards[0]["name"],
+            "gpu_vram_gb": round(total_vram, 1),
+            "gpu_count": len(cards),
+            "gpus": cards,
+            "gpu_groups": groups,
+            "homogeneous": len(groups) <= 1,
+            "backend": "rocm",
+            "unified_memory": is_apu,
+        }
+    except Exception:
+        # Best-effort probe: any unexpected failure is reported as "no AMD GPU".
+        return None
+
+
+def _read_file(path):
+    """Read a file, locally or via SSH."""
+    if _remote_host:
+        return _run(["cat", path])
+    try:
+        with open(path) as f:
+            return f.read()
+    except Exception:
+        return None
+
+
+def _parse_meminfo():
+    """Parse /proc/meminfo into a dict of key -> KB values."""
+    text = _read_file("/proc/meminfo")
+    if not text:
+        return {}
+    result = {}
+    for line in text.split("\n"):
+        if ":" in line:
+            key, val = line.split(":", 1)
+            parts = val.strip().split()
+            if parts:
+                try:
+                    result[key.strip()] = int(parts[0])
+                except ValueError:
+                    pass
+    return result
+
+
+def _get_ram_gb():
+    meminfo = _parse_meminfo()
+    if "MemTotal" in meminfo:
+        return meminfo["MemTotal"] / (1024**2)
+
+    if not _remote_host:
+        try:
+            pages = os.sysconf("SC_PHYS_PAGES")
+            page_size = os.sysconf("SC_PAGE_SIZE")
+            if pages and page_size:
+                return (pages * page_size) / (1024**3)
+        except Exception:
+            pass
+    return 0.0
+
+
+def _get_available_ram_gb():
+    meminfo = _parse_meminfo()
+    if "MemAvailable" in meminfo:
+        return meminfo["MemAvailable"] / (1024**2)
+    return _get_ram_gb() * 0.7
+
+
+def _get_cpu_name():
+    text = _read_file("/proc/cpuinfo")
+    if text:
+        for line in text.split("\n"):
+            if line.startswith("model name"):
+                return line.split(":", 1)[1].strip()
+
+    if not _remote_host:
+        return platform.processor() or "unknown"
+    return "unknown"
+
+
+def _get_cpu_count():
+    if _remote_host:
+        out = _run(["nproc"])
+        if out:
+            try:
+                return int(out.strip())
+            except ValueError:
+                pass
+        # fallback: count "processor" lines in /proc/cpuinfo
+        text = _read_file("/proc/cpuinfo")
+        if text:
+            return sum(1 for line in text.split("\n") if line.startswith("processor"))
+    return os.cpu_count() or 1
+
+
+def _detect_windows():
+    """Detect Windows hardware in a single SSH call using PowerShell.
+
+    The script collects RAM, CPU and GPU facts into a hashtable and prints it
+    as compact JSON, which is then mapped onto the same result shape that
+    detect_system() produces for Linux hosts.
+
+    Returns:
+        The result dict, or None when the command yields no output or the
+        output cannot be parsed.
+    """
+    # Single PowerShell command that gathers all hardware info at once
+    ps_cmd = (
+        "$r = @{}; "
+        "$os = Get-CimInstance Win32_OperatingSystem; "
+        # Both memory figures are divided by 1048576 (presumably KB → GB).
+        "$r.ram_gb = [math]::Round($os.TotalVisibleMemorySize / 1048576, 1); "
+        "$r.avail_gb = [math]::Round($os.FreePhysicalMemory / 1048576, 1); "
+        "$cpu = Get-CimInstance Win32_Processor | Select-Object -First 1; "
+        "$r.cpu_name = $cpu.Name; "
+        "$r.cpu_cores = (Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum; "
+        "$r.arch = $cpu.AddressWidth; "
+        # GPU detection via nvidia-smi (fastest) or WMI fallback
+        "try { "
+        "  $nv = nvidia-smi --query-gpu=memory.total,name --format=csv,noheader,nounits 2>$null; "
+        "  if ($LASTEXITCODE -eq 0 -and $nv) { "
+        "    $gpus = @(); "
+        "    foreach ($line in $nv -split \"`n\") { "
+        "      $p = $line -split ','; "
+        "      if ($p.Count -ge 2) { $gpus += @{name=$p[1].Trim(); vram_mb=[double]$p[0].Trim()} } "
+        "    }; "
+        "    $r.gpu_name = $gpus[0].name; "
+        "    $r.gpu_vram_gb = [math]::Round(($gpus | Measure-Object -Property vram_mb -Sum).Sum / 1024, 1); "
+        "    $r.gpu_count = $gpus.Count; "
+        "    $r.gpu_backend = 'cuda'; "
+        "  } "
+        "} catch {}; "
+        # WMI fallback: first video controller reporting a non-zero AdapterRAM.
+        # NOTE(review): AdapterRAM is presumably a 32-bit field, so large cards
+        # may be under-reported on this path — confirm.
+        "if (-not $r.gpu_name) { "
+        "  $wmiGpu = Get-CimInstance Win32_VideoController | Where-Object { $_.AdapterRAM -gt 0 } | Select-Object -First 1; "
+        "  if ($wmiGpu) { "
+        "    $r.gpu_name = $wmiGpu.Name; "
+        "    $r.gpu_vram_gb = [math]::Round($wmiGpu.AdapterRAM / 1073741824, 1); "
+        "    $r.gpu_count = 1; "
+        "    $r.gpu_backend = 'cpu_x86'; "  # WMI doesn't tell us CUDA/ROCm
+        "  } "
+        "}; "
+        "$r | ConvertTo-Json -Compress"
+    )
+    # NOTE(review): the script itself contains double quotes and is wrapped in
+    # double quotes here; how that survives depends on the remote shell —
+    # confirm on the target hosts.
+    out = _run(f'powershell -Command "{ps_cmd}"')
+    if not out:
+        return None
+    import json as _json
+    try:
+        d = _json.loads(out)
+        result = {
+            "total_ram_gb": d.get("ram_gb", 0),
+            "available_ram_gb": d.get("avail_gb", 0),
+            "cpu_cores": d.get("cpu_cores", 1),
+            "cpu_name": d.get("cpu_name", "unknown"),
+            "has_gpu": bool(d.get("gpu_name")),
+            "gpu_name": d.get("gpu_name"),
+            "gpu_vram_gb": d.get("gpu_vram_gb"),
+            "gpu_count": d.get("gpu_count", 0),
+            "backend": d.get("gpu_backend", "cpu_x86"),
+        }
+        # PowerShell only reports aggregate GPU info, not per-card detail, so we
+        # can't tell a mixed box from a uniform one here — assume one homogeneous
+        # pool spanning all reported GPUs (the common Windows case).
+        _n = result["gpu_count"] or 0
+        if result["has_gpu"] and _n > 0:
+            # Split the aggregate VRAM evenly across the reported GPUs.
+            _each = round((result["gpu_vram_gb"] or 0) / _n, 1)
+            result["gpus"] = [
+                {"index": i, "name": result["gpu_name"], "vram_gb": _each} for i in range(_n)
+            ]
+            result["gpu_groups"] = [{
+                "name": result["gpu_name"],
+                "vram_each": _each,
+                "count": _n,
+                "indices": list(range(_n)),
+                "vram_total": result["gpu_vram_gb"],
+            }]
+            result["homogeneous"] = True
+        return result
+    except Exception:
+        # Malformed JSON or an unexpected shape: treat as "detection failed".
+        return None
+
+
+_cache_by_host = {}  # host -> (timestamp, result)
+
+
+def detect_system(host="", ssh_port="", platform="", fresh=False):
+    """Detect system hardware: RAM, CPU, GPU. Cached per host (hardware rarely
+    changes, and probing a remote host over SSH is slow). Pass fresh=True to
+    bypass the cache and re-probe (the "Rescan" button).
+    If host is set (e.g. 'user@server'), runs detection commands over SSH.
+    platform: "windows", "linux", "termux", or "" (auto-detect).
+    """
+    global _remote_host, _remote_port, _remote_platform
+
+    cache_key = host or "_local"
+    now = time.time()
+    if not fresh and cache_key in _cache_by_host:
+        ts, cached = _cache_by_host[cache_key]
+        if (now - ts) < CACHE_TTL:
+            return cached
+
+    _remote_host = host or None
+    _remote_port = ssh_port or None
+    _remote_platform = platform or None
+
+    # Windows: single PowerShell command for all hardware info
+    if _remote_platform == "windows" and _remote_host:
+        result = _detect_windows()
+        if result:
+            _remote_host = None
+            _remote_platform = None
+            _cache_by_host[cache_key] = (now, result)
+            return result
+        # If Windows detection failed, return error
+        result = {"error": f"Cannot connect to {host}", "host": host}
+        _remote_host = None
+        _remote_platform = None
+        _cache_by_host[cache_key] = (now, result)
+        return result
+
+    # Linux/Termux: existing multi-command detection
+    total_ram = round(_get_ram_gb(), 1)
+    # If remote host returns 0 RAM, connection likely failed
+    if _remote_host and total_ram <= 0:
+        result = {"error": f"Cannot connect to {host}", "host": host}
+        _cache_by_host[cache_key] = (now, result)
+        _remote_host = None
+        _remote_platform = None
+        return result
+    available_ram = round(_get_available_ram_gb(), 1)
+    cpu_cores = _get_cpu_count()
+    cpu_name = _get_cpu_name()
+
+    gpu_info = _detect_nvidia() or _detect_amd()
+
+    if gpu_info:
+        result = {
+            "total_ram_gb": total_ram,
+            "available_ram_gb": available_ram,
+            "cpu_cores": cpu_cores,
+            "cpu_name": cpu_name,
+            "has_gpu": True,
+            "gpu_name": gpu_info["gpu_name"],
+            "gpu_vram_gb": gpu_info["gpu_vram_gb"],
+            "gpu_count": gpu_info["gpu_count"],
+            "gpus": gpu_info.get("gpus", []),
+            "gpu_groups": gpu_info.get("gpu_groups", []),
+            "homogeneous": gpu_info.get("homogeneous", True),
+            "backend": gpu_info["backend"],
+        }
+    else:
+        if _remote_host:
+            arch_out = _run(["uname", "-m"]) or ""
+        else:
+            import platform as _platform
+            arch_out = _platform.machine().lower()
+        backend = "cpu_arm" if "aarch64" in arch_out or "arm" in arch_out else "cpu_x86"
+        result = {
+            "total_ram_gb": total_ram,
+            "available_ram_gb": available_ram,
+            "cpu_cores": cpu_cores,
+            "cpu_name": cpu_name,
+            "has_gpu": False,
+            "gpu_name": None,
+            "gpu_vram_gb": None,
+            "gpu_count": 0,
+            "backend": backend,
+            # Set when nvidia-smi exists but failed (e.g. driver/library
+            # version mismatch) — lets the UI say "GPU driver error" instead
+            # of the misleading "No GPU".
+            "gpu_error": _last_gpu_error,
+        }
+
+    _remote_host = None
+    _remote_platform = None
+    _cache_by_host[cache_key] = (now, result)
+    return result
@@ -0,0 +1,374 @@
+"""Image generation model registry and VRAM fitting for Cookbook."""
+
+# Curated registry of image generation models supported by diffusers.
+# ONLY verified HuggingFace repo IDs.
+# VRAM estimates are for inference (single image generation).
+#
+# Entry fields, as consumed by rank_image_models() in this module:
+#   id, name, provider, capabilities, description, released
+#       Descriptive metadata copied through to the ranked output. id, name
+#       and description are also the fields the free-text search matches.
+#   params_b
+#       Parameter count, presumably in billions (the descriptions say e.g.
+#       "6B"); only passed through, never used in the fit calculation.
+#   vram_bf16, vram_fp8, vram_q4
+#       Estimated VRAM per precision tier. Compared against the system's
+#       gpu_vram_gb, so expressed in GB. None means the tier is not offered
+#       and is skipped when fitting.
+#   default_quant
+#       Quant label reported when no tier fits the detected GPU.
+#   quant_repos
+#       Optional {quant label: HuggingFace repo} map of pre-quantized
+#       weights; the ranking code looks up the label of the tier it selected.
+#   quality, speed
+#       Scores blended into the ranking score (weights 0.6 and 0.2).
+IMAGE_MODEL_REGISTRY = [
+    # ── Z-Image (Alibaba Tongyi) ──
+    {
+        "id": "Tongyi-MAI/Z-Image-Turbo",
+        "name": "Z-Image Turbo",
+        "provider": "Tongyi",
+        "params_b": 6.0,
+        "vram_bf16": 19.0,
+        "vram_fp8": 10.0,
+        "vram_q4": 6.0,
+        "default_quant": "BF16",
+        "quant_repos": {
+            "FP8": "drbaph/Z-Image-Turbo-FP8",
+        },
+        "capabilities": ["text-to-image"],
+        "description": "6B distilled, 8-step. Sub-second on H800. Apache 2.0.",
+        "quality": 92,
+        "speed": 95,
+        "released": "2025-12",
+    },
+    {
+        "id": "Tongyi-MAI/Z-Image",
+        "name": "Z-Image",
+        "provider": "Tongyi",
+        "params_b": 6.0,
+        "vram_bf16": 19.0,
+        "vram_fp8": 10.0,
+        "vram_q4": 6.0,
+        "default_quant": "BF16",
+        "quant_repos": {
+            "FP8": "drbaph/Z-Image-fp8",
+        },
+        "capabilities": ["text-to-image"],
+        "description": "Full undistilled model. Highest creative freedom. Apache 2.0.",
+        "quality": 93,
+        "speed": 70,
+        "released": "2025-12",
+    },
+    # ── Qwen Image ──
+    {
+        "id": "Qwen/Qwen-Image-2512",
+        "name": "Qwen Image 2512",
+        "provider": "Qwen",
+        "params_b": 20.0,
+        "vram_bf16": 42.0,
+        "vram_fp8": 22.0,
+        "vram_q4": 14.0,
+        "default_quant": "FP8",
+        "quant_repos": {},
+        "capabilities": ["text-to-image", "text-rendering"],
+        "description": "Dec 2025 update. Better humans, finer detail, strong text. Apache 2.0.",
+        "quality": 95,
+        "speed": 50,
+        "released": "2025-12",
+    },
+    {
+        "id": "Qwen/Qwen-Image",
+        "name": "Qwen Image",
+        "provider": "Qwen",
+        "params_b": 20.0,
+        "vram_bf16": 42.0,
+        "vram_fp8": 22.0,
+        "vram_q4": 14.0,
+        "default_quant": "FP8",
+        "quant_repos": {},
+        "capabilities": ["text-to-image", "text-rendering"],
+        "description": "20B foundation. Best text rendering in images. Apache 2.0.",
+        "quality": 94,
+        "speed": 50,
+        "released": "2025-08",
+    },
+    {
+        "id": "Qwen/Qwen-Image-Edit-2511",
+        "name": "Qwen Image Edit",
+        "provider": "Qwen",
+        "params_b": 20.0,
+        "vram_bf16": 42.0,
+        "vram_fp8": 22.0,
+        "vram_q4": 14.0,
+        "default_quant": "FP8",
+        "quant_repos": {},
+        "capabilities": ["image-editing", "inpainting"],
+        "description": "Dedicated editing. Style transfer, object removal. Apache 2.0.",
+        "quality": 92,
+        "speed": 50,
+        "released": "2025-11",
+    },
+    # ── Stable Diffusion (dedicated inpainting) ──
+    {
+        "id": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+        "name": "SDXL Inpainting",
+        "provider": "Stability AI",
+        "params_b": 3.5,
+        "vram_bf16": 12.0,
+        "vram_fp8": 8.0,
+        "vram_q4": 6.0,
+        "default_quant": "BF16",
+        "quant_repos": {},
+        "capabilities": ["inpainting", "image-editing"],
+        "description": "SDXL fine-tuned for inpainting (9-channel UNet). Best SD-family fill quality; fits a 24GB card comfortably.",
+        "quality": 86,
+        "speed": 68,
+        "released": "2023-11",
+    },
+    {
+        "id": "stable-diffusion-v1-5/stable-diffusion-inpainting",
+        "name": "SD 1.5 Inpainting",
+        "provider": "Stability AI",
+        "params_b": 1.1,
+        "vram_bf16": 4.0,
+        "vram_fp8": 3.0,
+        "vram_q4": 2.5,
+        "default_quant": "BF16",
+        "quant_repos": {},
+        "capabilities": ["inpainting"],
+        "description": "Classic SD 1.5 inpaint. Very light and fast; lower fidelity than SDXL.",
+        "quality": 70,
+        "speed": 92,
+        "released": "2022-10",
+    },
+    # ── FLUX ──
+    {
+        "id": "black-forest-labs/FLUX.1-dev",
+        "name": "FLUX.1 Dev",
+        "provider": "Black Forest Labs",
+        "params_b": 12.0,
+        "vram_bf16": 33.0,
+        "vram_fp8": 17.0,
+        "vram_q4": 10.0,
+        "default_quant": "FP8",
+        "quant_repos": {
+            "FP8": "diffusers/FLUX.1-dev-torchao-fp8",
+        },
+        "capabilities": ["text-to-image"],
+        "description": "High quality, detailed. Popular community model. Non-commercial.",
+        "quality": 92,
+        "speed": 55,
+        "released": "2024-08",
+    },
+    {
+        "id": "black-forest-labs/FLUX.1-schnell",
+        "name": "FLUX.1 Schnell",
+        "provider": "Black Forest Labs",
+        "params_b": 12.0,
+        "vram_bf16": 33.0,
+        "vram_fp8": 17.0,
+        "vram_q4": 10.0,
+        "default_quant": "FP8",
+        "quant_repos": {
+            "FP8": "Kijai/flux-fp8",
+        },
+        "capabilities": ["text-to-image"],
+        "description": "Fast 4-step variant. Apache 2.0 license.",
+        "quality": 85,
+        "speed": 90,
+        "released": "2024-08",
+    },
+    # ── Stable Diffusion ──
+    {
+        "id": "stabilityai/stable-diffusion-3.5-medium",
+        "name": "SD 3.5 Medium",
+        "provider": "Stability AI",
+        "params_b": 2.5,
+        "vram_bf16": 12.0,
+        "vram_fp8": 7.0,
+        "vram_q4": None,
+        "default_quant": "BF16",
+        "quant_repos": {
+            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
+        },
+        "capabilities": ["text-to-image"],
+        "description": "2.5B lightweight, fast. Fits almost any GPU.",
+        "quality": 75,
+        "speed": 95,
+        "released": "2024-10",
+    },
+    {
+        "id": "stabilityai/stable-diffusion-3.5-large",
+        "name": "SD 3.5 Large",
+        "provider": "Stability AI",
+        "params_b": 8.1,
+        "vram_bf16": 22.0,
+        "vram_fp8": 12.0,
+        "vram_q4": None,
+        "default_quant": "BF16",
+        "quant_repos": {
+            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
+        },
+        "capabilities": ["text-to-image"],
+        "description": "8B high quality. Good balance of speed and quality.",
+        "quality": 85,
+        "speed": 70,
+        "released": "2024-10",
+    },
+    {
+        "id": "stabilityai/stable-diffusion-3.5-large-turbo",
+        "name": "SD 3.5 Large Turbo",
+        "provider": "Stability AI",
+        "params_b": 8.1,
+        "vram_bf16": 22.0,
+        "vram_fp8": 12.0,
+        "vram_q4": None,
+        "default_quant": "BF16",
+        "quant_repos": {
+            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
+        },
+        "capabilities": ["text-to-image"],
+        "description": "Distilled for few-step inference. Fastest large SD.",
+        "quality": 80,
+        "speed": 92,
+        "released": "2024-10",
+    },
+    {
+        "id": "stabilityai/stable-diffusion-xl-base-1.0",
+        "name": "SDXL",
+        "provider": "Stability AI",
+        "params_b": 3.5,
+        "vram_bf16": 10.0,
+        "vram_fp8": 6.0,
+        "vram_q4": None,
+        "default_quant": "BF16",
+        "quant_repos": {},
+        "capabilities": ["text-to-image"],
+        "description": "Classic workhorse. Huge LoRA ecosystem. Fits 8GB+.",
+        "quality": 72,
+        "speed": 90,
+        "released": "2023-07",
+    },
+    # ── Hunyuan ──
+    # NOTE(review): the description below says 13B is the count *activated*
+    # per token; confirm that params_b and the VRAM figures are meant to
+    # reflect activated rather than total weights.
+    {
+        "id": "tencent/HunyuanImage-3.0",
+        "name": "HunyuanImage 3.0",
+        "provider": "Tencent",
+        "params_b": 13.0,
+        "vram_bf16": 30.0,
+        "vram_fp8": 16.0,
+        "vram_q4": 9.0,
+        "default_quant": "FP8",
+        "quant_repos": {
+            "Q4": "wikeeyang/Hunyuan-Image-30-Qint4",
+            # NOTE(review): rank_image_models() only looks up the labels
+            # "BF16", "FP8" and "Q4", so this "NF4" entry is not surfaced by
+            # the ranking code in this module.
+            "NF4": "EricRollei/HunyuanImage-3.0-Instruct-NF4",
+        },
+        "capabilities": ["text-to-image", "text-rendering"],
+        "description": "Strong text rendering. Bilingual Chinese/English. 13B activated per token.",
+        "quality": 88,
+        "speed": 60,
+        "released": "2025-09",
+    },
+    {
+        "id": "tencent/HunyuanImage-3.0-Instruct-Distil",
+        "name": "HunyuanImage 3.0 Distil",
+        "provider": "Tencent",
+        "params_b": 13.0,
+        "vram_bf16": 30.0,
+        "vram_fp8": 16.0,
+        "vram_q4": 9.0,
+        "default_quant": "FP8",
+        "quant_repos": {},
+        "capabilities": ["text-to-image", "text-rendering"],
+        "description": "Distilled variant, fewer steps. Faster with comparable quality.",
+        "quality": 85,
+        "speed": 80,
+        "released": "2026-01",
+    },
+]
+
+
+def get_image_models():
+    """Return the image model registry."""
+    return IMAGE_MODEL_REGISTRY
+
+
+def rank_image_models(system, search=None, sort="fit"):
+    """Score and rank image models against detected hardware.
+
+    Returns list of models with fit info (vram needed, fits, recommended quant).
+    """
+    gpu_vram = system.get("gpu_vram_gb", 0) or 0
+    has_gpu = system.get("has_gpu", False)
+    results = []
+
+    for model in IMAGE_MODEL_REGISTRY:
+        # Filter by search
+        if search:
+            s = search.lower()
+            if s not in model["name"].lower() and s not in model["id"].lower() and s not in model.get("description", "").lower():
+                continue
+
+        # Determine best quant that fits
+        quant = None
+        vram_needed = None
+        fits = False
+        quant_repo = None
+
+        if has_gpu and gpu_vram > 0:
+            # Try BF16 first, then FP8, then Q4
+            for q, vram_key in [("BF16", "vram_bf16"), ("FP8", "vram_fp8"), ("Q4", "vram_q4")]:
+                v = model.get(vram_key)
+                if v is not None and v <= gpu_vram * 0.90:  # 10% headroom
+                    quant = q
+                    vram_needed = v
+                    fits = True
+                    quant_repo = model.get("quant_repos", {}).get(q)
+                    break
+            # If nothing fits, show what it needs
+            if not fits:
+                quant = model["default_quant"]
+                vram_needed = model.get("vram_bf16", 0)
+
+        # Fit label
+        if not has_gpu:
+            fit = "no_gpu"
+            fit_label = "No GPU"
+        elif fits:
+            headroom = gpu_vram - vram_needed
+            if headroom > gpu_vram * 0.3:
+                fit = "perfect"
+                fit_label = "Perfect"
+            elif headroom > gpu_vram * 0.1:
+                fit = "good"
+                fit_label = "Good"
+            else:
+                fit = "tight"
+                fit_label = "Tight"
+        else:
+            fit = "no_fit"
+            fit_label = "Too large"
+
+        # Score: quality * speed * fit bonus
+        score = model["quality"] * 0.6 + model["speed"] * 0.2
+        if fit == "perfect":
+            score += 20
+        elif fit == "good":
+            score += 10
+        elif fit == "tight":
+            score += 5
+        elif fit == "no_fit":
+            score -= 30
+
+        results.append({
+            "id": model["id"],
+            "name": model["name"],
+            "provider": model["provider"],
+            "params_b": model["params_b"],
+            "vram_needed": vram_needed,
+            "quant": quant,
+            "quant_repo": quant_repo,
+            "fits": fits,
+            "fit": fit,
+            "fit_label": fit_label,
+            "quality": model["quality"],
+            "speed": model["speed"],
+            "score": round(score, 1),
+            "capabilities": model["capabilities"],
+            "description": model["description"],
+            "released": model.get("released", ""),
+        })
+
+    # Sort
+    if sort == "quality":
+        results.sort(key=lambda x: (-x["quality"], -x["score"]))
+    elif sort == "speed":
+        results.sort(key=lambda x: (-x["speed"], -x["score"]))
+    elif sort == "vram":
+        results.sort(key=lambda x: (x["vram_needed"] or 999, -x["score"]))
+    else:  # fit (default)
+        results.sort(key=lambda x: (-x["score"],))
+
+    return results
@@ -0,0 +1,177 @@
+import json
+import os
+import re
+
+# GGUF quantization levels in descending quality order. best_quant_for_budget()
+# walks this list front to back and returns the first level that fits.
+QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
+
+# Memory factor per quant, used by estimate_memory_gb(): parameters (in
+# billions) times this factor is the weight footprint in GB. Quants missing
+# from this table fall back to 0.58 there.
+QUANT_BPP = {
+    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
+    "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
+    "AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
+    "GPTQ-Int4": 0.50, "GPTQ-Int8": 1.0,
+    "mlx-4bit": 0.55, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
+}
+
+# Per-quant speed multiplier. Not referenced inside this module; presumably a
+# relative throughput factor applied by the importing scoring code (confirm
+# at the import site).
+QUANT_SPEED_MULT = {
+    "F16": 0.6, "BF16": 0.6, "FP8": 0.85,
+    "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
+    "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
+    "AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
+    "GPTQ-Int4": 1.2, "GPTQ-Int8": 0.85,
+    "mlx-4bit": 1.15, "mlx-8bit": 0.85, "mlx-6bit": 1.0,
+}
+
+# Per-quant quality adjustment (zero or negative). Not referenced inside this
+# module; presumably added to a quality score by the importer (confirm).
+QUANT_QUALITY_PENALTY = {
+    "F16": 0.0, "BF16": 0.0, "FP8": 0.0,
+    "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
+    "Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
+    "AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
+    "GPTQ-Int4": -3.0, "GPTQ-Int8": 0.0,
+    "mlx-4bit": -4.0, "mlx-8bit": 0.0, "mlx-6bit": -1.0,
+}
+
+# Bytes per parameter for each quant. Not referenced inside this module.
+# NOTE(review): overlaps with QUANT_BPP but carries different values (e.g.
+# Q4_K_M is 0.5 here vs 0.58 there) and has no "F32" entry; confirm which
+# table each consumer is supposed to use.
+QUANT_BYTES_PER_PARAM = {
+    "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
+    "Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
+    "AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
+    "GPTQ-Int4": 0.5, "GPTQ-Int8": 1.0,
+    "mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
+}
+
+# Pre-quantized formats that should NOT go through the GGUF quant hierarchy.
+# Matched as string prefixes of a model's "quantization" value by
+# is_prequantized().
+PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8")
+
+
+def is_prequantized(model):
+    q = model.get("quantization", "")
+    return any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
+
+
+def params_b(model):
+    raw = model.get("parameters_raw")
+    if raw and raw > 0:
+        return raw / 1_000_000_000.0
+
+    pc = model.get("parameter_count", "")
+    if pc:
+        pc = pc.strip().upper()
+        m = re.match(r"^([\d.]+)\s*([BKMGT]?)$", pc)
+        if m:
+            val = float(m.group(1))
+            suffix = m.group(2)
+            if suffix == "B":
+                return val
+            elif suffix == "M":
+                return val / 1000.0
+            elif suffix == "K":
+                return val / 1_000_000.0
+            elif suffix == "T":
+                return val * 1000.0
+            else:
+                # No unit. A bare number this size is conventionally a millions
+                # count (e.g. "355" = 355M), NOT billions — otherwise a 355M
+                # model would sort as 355B and leap above every 7B/70B model.
+                # A genuine billions figure carries a "B" suffix and is handled
+                # above; very large bare values are raw parameter counts.
+                if val >= 1_000_000:
+                    return val / 1_000_000_000.0  # raw count
+                if val >= 1000:
+                    return val / 1000.0           # thousands of millions? treat as millions
+                return val / 1000.0               # e.g. "355" → 0.355B
+    return 0.0
+
+
+def estimate_memory_gb(model, quant, ctx):
+    """Estimate VRAM needed to serve a model. All weights must be loaded,
+    even for MoE (all experts live in memory, only active ones compute per token).
+    KV cache scales with active params for MoE (only active experts have KV state)."""
+    pb = params_b(model)
+    bpp = QUANT_BPP.get(quant, 0.58)
+    kv_params = _active_params_b(model)
+    return pb * bpp + 0.000008 * kv_params * ctx + 0.5
+
+
+def _active_params_b(model):
+    """For MoE: active params per token (affects KV cache and speed, not total VRAM).
+    For dense: same as total params."""
+    if model.get("is_moe") and model.get("active_parameters"):
+        return model["active_parameters"] / 1_000_000_000.0
+    return params_b(model)
+
+
+def best_quant_for_budget(model, budget_gb, ctx):
+    """Find best quant that fits in budget_gb of VRAM.
+    Pre-quantized models (AWQ/GPTQ/MLX) use their native quant only.
+    Returns (quant, ctx, mem_gb) or (None, None, None).
+    """
+    if is_prequantized(model):
+        q = model.get("quantization", "Q4_K_M")
+        mem = estimate_memory_gb(model, q, ctx)
+        if mem <= budget_gb:
+            return q, ctx, mem
+        # Try halving context
+        cur_ctx = ctx // 2
+        while cur_ctx >= 1024:
+            mem = estimate_memory_gb(model, q, cur_ctx)
+            if mem <= budget_gb:
+                return q, cur_ctx, mem
+            cur_ctx //= 2
+        return None, None, None
+
+    # GGUF: try best quality first, then fall back
+    for q in QUANT_HIERARCHY:
+        mem = estimate_memory_gb(model, q, ctx)
+        if mem <= budget_gb:
+            return q, ctx, mem
+
+    cur_ctx = ctx // 2
+    while cur_ctx >= 1024:
+        for q in QUANT_HIERARCHY:
+            mem = estimate_memory_gb(model, q, cur_ctx)
+            if mem <= budget_gb:
+                return q, cur_ctx, mem
+        cur_ctx //= 2
+
+    return None, None, None
+
+
+def infer_use_case(model):
+    name = model.get("name", "").lower()
+    uc = model.get("use_case", "").lower()
+    combined = name + " " + uc
+
+    if any(k in combined for k in ("embedding", "embed", "bge")):
+        return "embedding"
+    if any(k in combined for k in ("tts", "text-to-speech", "speech-synthesis", "cosyvoice", "parler")):
+        return "tts"
+    if any(k in combined for k in ("stt", "speech-to-text", "whisper", "transcri", "asr")):
+        return "stt"
+    if "code" in combined:
+        return "coding"
+    if any(k in combined for k in ("vision", "multimodal", "vlm", "vl-")):
+        return "multimodal"
+    if any(k in combined for k in ("reason", "chain-of-thought", "deepseek-r1")):
+        return "reasoning"
+    if any(k in combined for k in ("chat", "instruction")):
+        return "chat"
+    return "general"
+
+
+_models_cache = None
+
+def get_models():
+    global _models_cache
+    if _models_cache is None:
+        data_path = os.path.join(os.path.dirname(__file__), "data", "hf_models.json")
+        try:
+            with open(data_path) as f:
+                _models_cache = json.load(f)
+        except (FileNotFoundError, json.JSONDecodeError):
+            _models_cache = []
+    return _models_cache
+
+
+def model_catalog_path():
+    return os.path.join(os.path.dirname(__file__), "data", "hf_models.json")