mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-30 00:22:10 -04:00
Cookbook fit: steer consumer AMD to GGUF recommendations
* Cookbook fit: consumer-AMD GGUF recommendations + accurate estimates (core logic)

  Split of #746 — the estimate/ranking MATH only, so it can be reviewed with tests first (UI changes follow separately). Backend files only: no static/js here.

  services/hwfit/fit.py, services/hwfit/hardware.py:
  - Recommend GGUF/llama.cpp on consumer AMD (RDNA, gfx10/11/12) instead of formats that don't run on consumer Radeon — vLLM-only AWQ/GPTQ/FP8 AND vendor-specific NVFP4 (NVIDIA) / MLX (Apple). Datacenter Instinct (CDNA) and CUDA are left untouched.
  - More accurate speed estimates across more GPUs (adds RDNA bandwidth data).
  - Detect AMD/RDNA GPUs (gpu_family from rocminfo) so fit/serve can branch on it.

  tests/test_hwfit_amd.py: AMD recommendation path, quant/bit matching, estimate realism, gfx RDNA-vs-CDNA classification.

  Rebased onto current main (analyze_model gained a scoring_use_case param there; kept it). Vision detection intentionally NOT added here — main already ships a "Vision" type filter + multimodal use-case handling; duplicating it was dropped.

  Checks: py_compile clean; pytest tests/test_hwfit_amd.py + hwfit/serve suites = 28 passed; full suite 0 new failures vs main.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* Tests: assert NVFP4/MLX/FP8 formats are filtered on consumer RDNA

  Backs the #972 claim with an explicit regression: no NVIDIA NVFP4, Apple MLX, or vLLM-only FP8/AWQ/GPTQ repos are recommended on a consumer Radeon, and guards against vacuity by asserting such repos exist in the catalog.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+57
-12
@@ -18,7 +18,7 @@ GPU_BANDWIDTH = {
|
||||
"7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
|
||||
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
|
||||
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
|
||||
"9070 xt": 624, "9070": 488,
|
||||
"9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
|
||||
# Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
|
||||
# reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
|
||||
# before the bare "m_" keys matters less than length-sorting (done below),
|
||||
@@ -70,8 +70,18 @@ def _lookup_bandwidth(gpu_name):
|
||||
return None
|
||||
|
||||
|
||||
def _estimate_speed(model, quant, run_mode, system):
|
||||
"""Estimate tok/s. Uses active params for MoE (only active experts run per token)."""
|
||||
def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
|
||||
"""Estimate tok/s. Uses active params for MoE (only active experts run per token).
|
||||
|
||||
offload_frac (0..1): fraction of the model's weights that spill to system RAM
|
||||
(CPU) because they don't fit VRAM. Generation reads every active weight per
|
||||
token, so when part lives in CPU RAM the per-token time is dominated by the
|
||||
slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and
|
||||
system-RAM bandwidth weighted by what's where — far more accurate than a flat
|
||||
"halve it" for partial offload, which under/over-shoots depending on amount.
|
||||
Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
|
||||
light offload → ~59 t/s est vs 59.8 measured.
|
||||
"""
|
||||
pb = _active_params_b(model)
|
||||
is_moe = model.get("is_moe", False)
|
||||
bw = _lookup_bandwidth(system.get("gpu_name"))
|
||||
@@ -83,14 +93,24 @@ def _estimate_speed(model, quant, run_mode, system):
|
||||
if model_gb <= 0:
|
||||
return 0.0
|
||||
efficiency = 0.55
|
||||
raw_tps = (bw / model_gb) * efficiency
|
||||
if run_mode == "cpu_offload":
|
||||
mode_factor = 0.5
|
||||
elif is_moe:
|
||||
mode_factor = 0.8
|
||||
else:
|
||||
mode_factor = 1.0
|
||||
return raw_tps * mode_factor
|
||||
# Dual-channel DDR4-3200 is ≈ 50 GB/s; DDR5 systems are higher, but stay
|
||||
# conservative because offloaded MoE is also compute-bound on the CPU.
|
||||
cpu_bw = 55.0
|
||||
frac = min(max(offload_frac, 0.0), 1.0)
|
||||
# If we don't know the fraction (legacy callers pass 0 with
|
||||
# cpu_offload), assume a meaningful spill so we don't overestimate.
|
||||
if frac <= 0.0:
|
||||
frac = 0.5
|
||||
# Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the
|
||||
# slow CPU portion dominates as it grows (matches the steep real-world
|
||||
# drop-off when more experts offload).
|
||||
eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw)
|
||||
raw_tps = (eff_bw / model_gb) * efficiency
|
||||
return raw_tps * (0.8 if is_moe else 1.0)
|
||||
# Fully on GPU.
|
||||
raw_tps = (bw / model_gb) * efficiency
|
||||
return raw_tps * (0.8 if is_moe else 1.0)
|
||||
|
||||
k = FALLBACK_K.get(backend, 70)
|
||||
if pb <= 0:
|
||||
@@ -357,7 +377,12 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
||||
else:
|
||||
fit_level = "marginal"
|
||||
|
||||
tps = _estimate_speed(model, quant, run_mode, system)
|
||||
# Fraction of the model that spills to CPU RAM (drives the offload speed
|
||||
# model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
|
||||
offload_frac = 0.0
|
||||
if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
|
||||
offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
|
||||
tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)
|
||||
|
||||
q_score = _quality_score(model, quant, score_use_case)
|
||||
s_score = _speed_score(tps, score_use_case)
|
||||
@@ -389,6 +414,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
||||
},
|
||||
"gguf_sources": model.get("gguf_sources", []),
|
||||
"context_length": model.get("context_length", 4096),
|
||||
"release_date": model.get("release_date", ""),
|
||||
}
|
||||
|
||||
|
||||
@@ -398,6 +424,10 @@ SORT_KEYS = {
|
||||
"vram": lambda r: r["required_gb"],
|
||||
"params": lambda r: r["params_b"],
|
||||
"context": lambda r: r["context"],
|
||||
# Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
|
||||
# string sort is chronological. Missing dates sort last (empty < any date,
|
||||
# and we sort reverse=True for newest, so "" lands at the bottom).
|
||||
"newest": lambda r: r.get("release_date") or "",
|
||||
}
|
||||
|
||||
|
||||
@@ -454,6 +484,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
apple_silicon = system_backend in ("mps", "metal", "apple")
|
||||
rocm = system_backend == "rocm"
|
||||
|
||||
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
|
||||
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
|
||||
# Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
|
||||
# are largely unsupported there and FP8 needs out-of-tree patches. So treat
|
||||
# consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
|
||||
# Unknown family (no rocminfo) is left untouched to avoid hiding models from
|
||||
# a possibly-capable Instinct box on a misdetect.
|
||||
gpu_family = (system.get("gpu_family") or "").lower()
|
||||
consumer_amd = system_backend == "rocm" and gpu_family == "rdna"
|
||||
|
||||
for m in models:
|
||||
native_q = m.get("quantization", "")
|
||||
if "nvfp4" in (m.get("name") or "").lower():
|
||||
@@ -479,7 +519,12 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
|
||||
# this the Cookbook recommends models the Mac can't run; on CUDA these
|
||||
# stay visible because vLLM serves safetensors directly.
|
||||
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
|
||||
#
|
||||
# Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
|
||||
# servable path, so a model needs a real GGUF to be recommended.
|
||||
# Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
|
||||
# Radeon that can't actually serve them.
|
||||
if (apple_silicon or consumer_amd) and not (m.get("is_gguf") or m.get("gguf_sources")):
|
||||
continue
|
||||
|
||||
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
|
||||
|
||||
Reference in New Issue
Block a user