mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
Cookbook fit: steer consumer AMD to GGUF recommendations
* Cookbook fit: consumer-AMD GGUF recommendations + accurate estimates (core logic) Split of #746 — the estimate/ranking MATH only, so it can be reviewed with tests first (UI changes follow separately). Backend files only: no static/js here. services/hwfit/fit.py, services/hwfit/hardware.py: - Recommend GGUF/llama.cpp on consumer AMD (RDNA, gfx10/11/12) instead of formats that don't run on consumer Radeon — vLLM-only AWQ/GPTQ/FP8 AND vendor-specific NVFP4 (NVIDIA) / MLX (Apple). Datacenter Instinct (CDNA) and CUDA are left untouched. - More accurate speed estimates across more GPUs (adds RDNA bandwidth data). - Detect AMD/RDNA GPUs (gpu_family from rocminfo) so fit/serve can branch on it. tests/test_hwfit_amd.py: AMD recommendation path, quant/bit matching, estimate realism, gfx RDNA-vs-CDNA classification. Rebased onto current main (analyze_model gained a scoring_use_case param there; kept it). Vision detection intentionally NOT added here — main already ships a "Vision" type filter + multimodal use-case handling; duplicating it was dropped. Checks: py_compile clean; pytest tests/test_hwfit_amd.py + hwfit/serve suites = 28 passed; full suite 0 new failures vs main. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * Tests: assert NVFP4/MLX/FP8 formats are filtered on consumer RDNA Backs the #972 claim with an explicit regression: no NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/GPTQ repos are recommended on a consumer Radeon, and guards against vacuity by asserting such repos exist in the catalog. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+57
-12
@@ -18,7 +18,7 @@ GPU_BANDWIDTH = {
|
||||
"7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
|
||||
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
|
||||
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
|
||||
"9070 xt": 624, "9070": 488,
|
||||
"9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
|
||||
# Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
|
||||
# reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
|
||||
# before the bare "m_" keys matters less than length-sorting (done below),
|
||||
@@ -70,8 +70,18 @@ def _lookup_bandwidth(gpu_name):
|
||||
return None
|
||||
|
||||
|
||||
def _estimate_speed(model, quant, run_mode, system):
|
||||
"""Estimate tok/s. Uses active params for MoE (only active experts run per token)."""
|
||||
def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
|
||||
"""Estimate tok/s. Uses active params for MoE (only active experts run per token).
|
||||
|
||||
offload_frac (0..1): fraction of the model's weights that spill to system RAM
|
||||
(CPU) because they don't fit VRAM. Generation reads every active weight per
|
||||
token, so when part lives in CPU RAM the per-token time is dominated by the
|
||||
slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and
|
||||
system-RAM bandwidth weighted by what's where — far more accurate than a flat
|
||||
"halve it" for partial offload, which under/over-shoots depending on amount.
|
||||
Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
|
||||
light offload → ~59 t/s est vs 59.8 measured.
|
||||
"""
|
||||
pb = _active_params_b(model)
|
||||
is_moe = model.get("is_moe", False)
|
||||
bw = _lookup_bandwidth(system.get("gpu_name"))
|
||||
@@ -83,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system):
|
||||
if model_gb <= 0:
|
||||
return 0.0
|
||||
efficiency = 0.55
|
||||
raw_tps = (bw / model_gb) * efficiency
|
||||
if run_mode == "cpu_offload":
|
||||
mode_factor = 0.5
|
||||
elif is_moe:
|
||||
mode_factor = 0.8
|
||||
else:
|
||||
mode_factor = 1.0
|
||||
return raw_tps * mode_factor
|
||||
# Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be
|
||||
# conservative since offloaded MoE is also compute-bound on CPU.
|
||||
cpu_bw = 55.0
|
||||
frac = min(max(offload_frac, 0.0), 1.0)
|
||||
# If we don't know the fraction (legacy callers pass 0 with
|
||||
# cpu_offload), assume a meaningful spill so we don't overestimate.
|
||||
if frac <= 0.0:
|
||||
frac = 0.5
|
||||
# Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the
|
||||
# slow CPU portion dominates as it grows (matches the steep real-world
|
||||
# drop-off when more experts offload).
|
||||
eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw)
|
||||
raw_tps = (eff_bw / model_gb) * efficiency
|
||||
return raw_tps * (0.8 if is_moe else 1.0)
|
||||
# Fully on GPU.
|
||||
raw_tps = (bw / model_gb) * efficiency
|
||||
return raw_tps * (0.8 if is_moe else 1.0)
|
||||
|
||||
k = FALLBACK_K.get(backend, 70)
|
||||
if pb <= 0:
|
||||
@@ -357,7 +377,12 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
||||
else:
|
||||
fit_level = "marginal"
|
||||
|
||||
tps = _estimate_speed(model, quant, run_mode, system)
|
||||
# Fraction of the model that spills to CPU RAM (drives the offload speed
|
||||
# model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
|
||||
offload_frac = 0.0
|
||||
if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
|
||||
offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
|
||||
tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)
|
||||
|
||||
q_score = _quality_score(model, quant, score_use_case)
|
||||
s_score = _speed_score(tps, score_use_case)
|
||||
@@ -389,6 +414,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
||||
},
|
||||
"gguf_sources": model.get("gguf_sources", []),
|
||||
"context_length": model.get("context_length", 4096),
|
||||
"release_date": model.get("release_date", ""),
|
||||
}
|
||||
|
||||
|
||||
@@ -398,6 +424,10 @@ SORT_KEYS = {
|
||||
"vram": lambda r: r["required_gb"],
|
||||
"params": lambda r: r["params_b"],
|
||||
"context": lambda r: r["context"],
|
||||
# Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
|
||||
# string sort is chronological. Missing dates sort last (empty < any date,
|
||||
# and we sort reverse=True for newest, so "" lands at the bottom).
|
||||
"newest": lambda r: r.get("release_date") or "",
|
||||
}
|
||||
|
||||
|
||||
@@ -454,6 +484,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
apple_silicon = system_backend in ("mps", "metal", "apple")
|
||||
rocm = system_backend == "rocm"
|
||||
|
||||
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
|
||||
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
|
||||
# Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
|
||||
# are largely unsupported there and FP8 needs out-of-tree patches. So treat
|
||||
# consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
|
||||
# Unknown family (no rocminfo) is left untouched to avoid hiding models from
|
||||
# a possibly-capable Instinct box on a misdetect.
|
||||
gpu_family = (system.get("gpu_family") or "").lower()
|
||||
consumer_amd = system_backend == "rocm" and gpu_family == "rdna"
|
||||
|
||||
for m in models:
|
||||
native_q = m.get("quantization", "")
|
||||
if "nvfp4" in (m.get("name") or "").lower():
|
||||
@@ -479,7 +519,12 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
|
||||
# this the Cookbook recommends models the Mac can't run; on CUDA these
|
||||
# stay visible because vLLM serves safetensors directly.
|
||||
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
|
||||
#
|
||||
# Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
|
||||
# servable path, so a model needs a real GGUF to be recommended.
|
||||
# Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
|
||||
# Radeon that can't actually serve them.
|
||||
if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
|
||||
continue
|
||||
|
||||
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
@@ -130,6 +131,33 @@ def _detect_nvidia():
|
||||
}
|
||||
|
||||
|
||||
def classify_amd_gfx(gfx):
    """Map an AMD ISA target (e.g. "gfx1200") to (gfx, family).

    family is one of:
      "rdna"    — consumer Radeon RX (gfx10xx RDNA1/2, gfx11xx RDNA3, gfx12xx RDNA4)
      "cdna"    — datacenter Instinct (gfx908 MI100, gfx90a MI200, gfx94x/95x MI300+)
      "gcn"     — older GCN/Vega (gfx900/906); any other gfx9xx target lands here
      "unknown" — empty/unrecognized; callers must treat conservatively

    This drives the serving decision: vLLM/SGLang on ROCm are validated on CDNA
    but fragile on consumer RDNA (AWQ kernels largely unsupported, FP8 needs
    out-of-tree patches), so RDNA is steered to GGUF/llama.cpp.

    Args:
        gfx: ISA target string, or None. Case and surrounding whitespace are
            ignored. A target-ID feature suffix ("gfx90a:sramecc+:xnack-") is
            accepted and dropped — only the processor name decides the family.

    Returns:
        (gfx, family): the normalized lowercase processor name and its family.
        The name is "" when the input is not a ``gfx<digits>[a-f]`` target.
    """
    # Target IDs may carry ":feature+/-" flags after the processor name; keep
    # only the processor so a flagged Instinct ISA isn't reported as unknown.
    target = (gfx or "").lower().strip().partition(":")[0].strip()
    m = re.fullmatch(r"gfx(\d+[a-f]?)", target)
    if not m:
        return "", "unknown"
    digits = m.group(1)
    if digits.startswith(("10", "11", "12")):
        return target, "rdna"
    # CDNA is checked before the generic gfx9 test because it shares the "9" prefix.
    if digits in ("908", "90a") or digits.startswith(("94", "95")):
        return target, "cdna"
    if digits.startswith("9"):
        return target, "gcn"
    return target, "unknown"
|
||||
|
||||
|
||||
def _detect_amd():
|
||||
"""Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
|
||||
and APUs / unified-memory SoCs like Strix Halo (which expose
|
||||
@@ -155,6 +183,17 @@ def _detect_amd():
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
def _amd_arch():
    """Return (gfx, family) for the AMD GPU, read best-effort from rocminfo.

    rocminfo is treated as the source of truth: GPU agents print a
    `Name: gfxNNNN` line while CPU agents print a brand string instead, so
    the first gfx token in the output is taken as the GPU ISA. When neither
    rocminfo invocation yields output, or no gfx token is present, an empty
    target is classified — see classify_amd_gfx for the resulting tuple.
    """
    # Try rocminfo on PATH first, then the conventional /opt/rocm location.
    output = _run(["rocminfo"])
    if not output:
        output = _run(["/opt/rocm/bin/rocminfo"])
    if not output:
        output = ""
    found = re.search(r"gfx\d+[a-f]?", output)
    isa = found.group(0) if found else ""
    return classify_amd_gfx(isa)
|
||||
|
||||
try:
|
||||
cards = []
|
||||
is_apu = False
|
||||
@@ -187,6 +226,7 @@ def _detect_amd():
|
||||
return None
|
||||
total_vram = sum(c["vram_gb"] for c in cards)
|
||||
groups = _group_gpus(cards)
|
||||
gfx, family = _amd_arch()
|
||||
# NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
|
||||
# is the real usable GPU memory — it's physically backed but reserved
|
||||
# by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
|
||||
@@ -200,6 +240,13 @@ def _detect_amd():
|
||||
"homogeneous": len(groups) <= 1,
|
||||
"backend": "rocm",
|
||||
"unified_memory": is_apu,
|
||||
# AMD ISA/family so downstream can tell datacenter Instinct (CDNA,
|
||||
# where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon
|
||||
# (RDNA, where the practical path is GGUF via llama.cpp). Empty/
|
||||
# "unknown" when rocminfo isn't available — callers must treat
|
||||
# unknown conservatively, not assume vLLM works.
|
||||
"gpu_arch": gfx,
|
||||
"gpu_family": family,
|
||||
}
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user