Cookbook fit: steer consumer AMD to GGUF recommendations

* Cookbook fit: consumer-AMD GGUF recommendations + accurate estimates (core logic)

Split of #746 — the estimate/ranking MATH only, so it can be reviewed with tests
first (UI changes follow separately). Backend files only: no static/js here.

services/hwfit/fit.py, services/hwfit/hardware.py:
- Recommend GGUF/llama.cpp on consumer AMD (RDNA, gfx10/11/12) instead of
  formats that don't run on consumer Radeon — vLLM-only AWQ/GPTQ/FP8 AND
  vendor-specific NVFP4 (NVIDIA) / MLX (Apple). Datacenter Instinct (CDNA) and
  CUDA are left untouched.
- More accurate speed estimates across more GPUs (adds RDNA bandwidth data).
- Detect AMD/RDNA GPUs (gpu_family from rocminfo) so fit/serve can branch on it.

tests/test_hwfit_amd.py: AMD recommendation path, quant/bit matching, estimate
realism, gfx RDNA-vs-CDNA classification.

Rebased onto current main (analyze_model gained a scoring_use_case param there;
it is kept). Vision detection is intentionally NOT added here — main already ships a
"Vision" type filter + multimodal use-case handling, so the duplicate was dropped.

Checks: py_compile clean; pytest tests/test_hwfit_amd.py + hwfit/serve suites
= 28 passed; full suite 0 new failures vs main.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* Tests: assert NVFP4/MLX/FP8 formats are filtered on consumer RDNA

Backs the #972 claim with an explicit regression: no NVIDIA NVFP4, Apple MLX,
or vLLM-only FP8/AWQ/GPTQ repos are recommended on a consumer Radeon, and guards
against vacuity by asserting such repos exist in the catalog.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Leo
2026-06-02 14:01:42 +02:00
committed by GitHub
parent fd89d098a1
commit de92bbe47a
3 changed files with 299 additions and 12 deletions
+57 -12
View File
@@ -18,7 +18,7 @@ GPU_BANDWIDTH = {
"7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
"9070 xt": 624, "9070": 488,
"9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
# Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
# reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
# before the bare "m_" keys matters less than length-sorting (done below),
@@ -70,8 +70,18 @@ def _lookup_bandwidth(gpu_name):
return None
def _estimate_speed(model, quant, run_mode, system):
"""Estimate tok/s. Uses active params for MoE (only active experts run per token)."""
def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
"""Estimate tok/s. Uses active params for MoE (only active experts run per token).
offload_frac (0..1): fraction of the model's weights that spill to system RAM
(CPU) because they don't fit VRAM. Generation reads every active weight per
token, so when part lives in CPU RAM the per-token time is dominated by the
slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and
system-RAM bandwidth weighted by what's where — far more accurate than a flat
"halve it" for partial offload, which under/over-shoots depending on amount.
Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
light offload → ~59 t/s est vs 59.8 measured.
"""
pb = _active_params_b(model)
is_moe = model.get("is_moe", False)
bw = _lookup_bandwidth(system.get("gpu_name"))
@@ -83,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system):
if model_gb <= 0:
return 0.0
efficiency = 0.55
raw_tps = (bw / model_gb) * efficiency
if run_mode == "cpu_offload":
mode_factor = 0.5
elif is_moe:
mode_factor = 0.8
else:
mode_factor = 1.0
return raw_tps * mode_factor
# Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be
# conservative since offloaded MoE is also compute-bound on CPU.
cpu_bw = 55.0
frac = min(max(offload_frac, 0.0), 1.0)
# If we don't know the fraction (legacy callers pass 0 with
# cpu_offload), assume a meaningful spill so we don't overestimate.
if frac <= 0.0:
frac = 0.5
# Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the
# slow CPU portion dominates as it grows (matches the steep real-world
# drop-off when more experts offload).
eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw)
raw_tps = (eff_bw / model_gb) * efficiency
return raw_tps * (0.8 if is_moe else 1.0)
# Fully on GPU.
raw_tps = (bw / model_gb) * efficiency
return raw_tps * (0.8 if is_moe else 1.0)
k = FALLBACK_K.get(backend, 70)
if pb <= 0:
@@ -357,7 +377,12 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
else:
fit_level = "marginal"
tps = _estimate_speed(model, quant, run_mode, system)
# Fraction of the model that spills to CPU RAM (drives the offload speed
# model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
offload_frac = 0.0
if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)
q_score = _quality_score(model, quant, score_use_case)
s_score = _speed_score(tps, score_use_case)
@@ -389,6 +414,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
},
"gguf_sources": model.get("gguf_sources", []),
"context_length": model.get("context_length", 4096),
"release_date": model.get("release_date", ""),
}
@@ -398,6 +424,10 @@ SORT_KEYS = {
"vram": lambda r: r["required_gb"],
"params": lambda r: r["params_b"],
"context": lambda r: r["context"],
# Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
# string sort is chronological. Missing dates sort last (empty < any date,
# and we sort reverse=True for newest, so "" lands at the bottom).
"newest": lambda r: r.get("release_date") or "",
}
@@ -454,6 +484,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
apple_silicon = system_backend in ("mps", "metal", "apple")
rocm = system_backend == "rocm"
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
# Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
# are largely unsupported there and FP8 needs out-of-tree patches. So treat
# consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
# Unknown family (no rocminfo) is left untouched to avoid hiding models from
# a possibly-capable Instinct box on a misdetect.
gpu_family = (system.get("gpu_family") or "").lower()
consumer_amd = system_backend == "rocm" and gpu_family == "rdna"
for m in models:
native_q = m.get("quantization", "")
if "nvfp4" in (m.get("name") or "").lower():
@@ -479,7 +519,12 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
# this the Cookbook recommends models the Mac can't run; on CUDA these
# stay visible because vLLM serves safetensors directly.
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
#
# Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
# servable path, so a model needs a real GGUF to be recommended.
# Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
# Radeon that can't actually serve them.
if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
continue
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
+47
View File
@@ -1,5 +1,6 @@
import os
import platform
import re
import shutil
import subprocess
import time
@@ -130,6 +131,33 @@ def _detect_nvidia():
}
def classify_amd_gfx(gfx):
    """Map an AMD ISA target (e.g. "gfx1200") to (gfx, family).

    Accepts either a bare processor name ("gfx90a") or a full target ID that
    carries a triple prefix and/or colon-separated feature flags
    ("amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-"). Only the processor name is
    classified, and that bare name is what is returned as ``gfx``.

    family is one of:
      "rdna"    — consumer Radeon RX (gfx10xx RDNA1/2, gfx11xx RDNA3, gfx12xx RDNA4)
      "cdna"    — datacenter Instinct (gfx908 MI100, gfx90a MI200, gfx94x/95x MI300+)
      "gcn"     — older GCN/Vega (gfx900/906)
      "unknown" — empty/unrecognized; callers must treat conservatively

    This drives the serving decision: vLLM/SGLang on ROCm are validated on CDNA
    but fragile on consumer RDNA (AWQ kernels largely unsupported, FP8 needs
    out-of-tree patches), so RDNA is steered to GGUF/llama.cpp.

    Args:
        gfx: ISA target string; ``None`` and "" are tolerated.

    Returns:
        ``(gfx, family)``. ``gfx`` is the normalized (lowercase, bare)
        processor name, or "" when the input is not a ``gfxNNN[a-f]`` target.
    """
    target = (gfx or "").lower().strip()
    # Reduce a full target ID to the bare processor: drop ":feature" flags,
    # then keep what follows the last "-" of any "amdgcn-amd-amdhsa--" prefix.
    target = target.split(":", 1)[0].rsplit("-", 1)[-1]
    m = re.fullmatch(r"gfx(\d+[a-f]?)", target)
    if not m:
        return "", "unknown"
    digits = m.group(1)
    if digits[:2] in ("10", "11", "12"):
        return target, "rdna"
    # CDNA must be tested before the generic gfx9 fallthrough below, because
    # gfx908/gfx90a/gfx94x/gfx95x share the leading "9" with GCN/Vega parts.
    if digits in ("908", "90a") or digits[:2] in ("94", "95"):
        return target, "cdna"
    if digits[:1] == "9":
        return target, "gcn"
    return target, "unknown"
def _detect_amd():
"""Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
and APUs / unified-memory SoCs like Strix Halo (which expose
@@ -155,6 +183,17 @@ def _detect_amd():
except Exception:
return []
def _amd_arch():
    """Best-effort AMD GPU ISA + family, read from rocminfo output.

    Tries `rocminfo` on PATH first, then the /opt/rocm/bin/rocminfo location,
    and treats a missing/empty result as empty text. The first `gfxNNN[a-f]`
    token found anywhere in that text is taken as the GPU ISA (this relies on
    GPU agents being the only ones that print a gfx target — CPU agents are
    expected to report a brand string instead).

    Returns:
        The (gfx, family) tuple produced by classify_amd_gfx; when no gfx
        token is found, whatever classify_amd_gfx returns for "".
    """
    output = _run(["rocminfo"])
    if not output:
        # Not on PATH (or produced nothing): fall back to the /opt/rocm path.
        output = _run(["/opt/rocm/bin/rocminfo"])
    if not output:
        output = ""
    found = re.search(r"gfx\d+[a-f]?", output)
    if found is None:
        return classify_amd_gfx("")
    return classify_amd_gfx(found.group(0))
try:
cards = []
is_apu = False
@@ -187,6 +226,7 @@ def _detect_amd():
return None
total_vram = sum(c["vram_gb"] for c in cards)
groups = _group_gpus(cards)
gfx, family = _amd_arch()
# NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
# is the real usable GPU memory — it's physically backed but reserved
# by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
@@ -200,6 +240,13 @@ def _detect_amd():
"homogeneous": len(groups) <= 1,
"backend": "rocm",
"unified_memory": is_apu,
# AMD ISA/family so downstream can tell datacenter Instinct (CDNA,
# where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon
# (RDNA, where the practical path is GGUF via llama.cpp). Empty/
# "unknown" when rocminfo isn't available — callers must treat
# unknown conservatively, not assume vLLM works.
"gpu_arch": gfx,
"gpu_family": family,
}
except Exception:
return None