mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Merge remote-tracking branch 'origin/main' into visual-pr-playground
# Conflicts: # routes/cookbook_routes.py # routes/hwfit_routes.py # services/hwfit/fit.py # services/hwfit/models.py # static/js/cookbook-diagnosis.js # static/js/cookbook-hwfit.js # static/js/cookbook.js # static/js/cookbookRunning.js
This commit is contained in:
+17
-3
@@ -61,7 +61,7 @@ CONTEXT_TARGET = {
|
||||
|
||||
|
||||
def _lookup_bandwidth(gpu_name):
|
||||
if not gpu_name:
|
||||
if not isinstance(gpu_name, str) or not gpu_name:
|
||||
return None
|
||||
gn = gpu_name.lower()
|
||||
for key in _BW_KEYS_SORTED:
|
||||
@@ -280,10 +280,14 @@ def _native_quant(model):
|
||||
return "FP8"
|
||||
if "gptq" in text:
|
||||
m = re.search(r"(?:gptq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
|
||||
return f"GPTQ-{m.group(1)}bit" if m else "GPTQ"
|
||||
# Canonical catalog label is "GPTQ-Int4"/"GPTQ-Int8" (see models.py
|
||||
# QUANT_BPP / QUANT_QUALITY_PENALTY keys); "GPTQ-4bit" misses both
|
||||
# maps, so BPP and the quality penalty silently fall to defaults.
|
||||
return f"GPTQ-Int{m.group(1)}" if m else "GPTQ-Int4"
|
||||
if "awq" in text:
|
||||
m = re.search(r"(?:awq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
|
||||
return f"AWQ-{m.group(1)}bit" if m else "AWQ"
|
||||
# Catalog keys are "AWQ-4bit"/"AWQ-8bit"; bare "AWQ" misses the maps.
|
||||
return f"AWQ-{m.group(1)}bit" if m else "AWQ-4bit"
|
||||
if "mlx" in text:
|
||||
m = re.search(r"mlx[-_]?(\d{1,2})bit", text)
|
||||
return f"mlx-{m.group(1)}bit" if m else native_quant
|
||||
@@ -571,6 +575,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
|
||||
system_backend = (system.get("backend") or "").lower()
|
||||
apple_silicon = system_backend in ("mps", "metal", "apple")
|
||||
rocm = system_backend == "rocm"
|
||||
|
||||
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
|
||||
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
|
||||
# Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
|
||||
@@ -589,6 +595,14 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower():
|
||||
continue
|
||||
|
||||
# ROCm support for vLLM/SGLang quantized safetensors is too brittle to
|
||||
# recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable
|
||||
# only when the user explicitly picks that format from the quant filter;
|
||||
# otherwise prefer GGUF/Q* entries that Odysseus can route through
|
||||
# llama.cpp/Ollama without pretending "fits VRAM" means "servable".
|
||||
if rocm and is_prequantized(m) and not filter_native:
|
||||
continue
|
||||
|
||||
# On Apple Silicon the only serving engines are llama.cpp and Ollama,
|
||||
# both GGUF-only (vLLM/SGLang are CUDA/ROCm and don't run on macOS). So
|
||||
# a model is Metal-servable ONLY if it ships a real GGUF. Drop everything
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
@@ -104,6 +105,8 @@ def _detect_nvidia():
|
||||
return None
|
||||
|
||||
gpus = []
|
||||
# Devices nvidia-smi lists with a real name but a non-numeric memory.total.
|
||||
unified = []
|
||||
# nvidia-smi lists GPUs in index order (0,1,2,...), so the row position is
|
||||
# the CUDA device index we'd pass to CUDA_VISIBLE_DEVICES.
|
||||
for idx, line in enumerate(out.strip().split("\n")):
|
||||
@@ -113,9 +116,32 @@ def _detect_nvidia():
|
||||
vram_mb = float(parts[0])
|
||||
gpus.append({"index": idx, "name": parts[1], "vram_gb": vram_mb / 1024.0})
|
||||
except ValueError:
|
||||
# Grace Blackwell GB10 / DGX Spark and other unified-memory
|
||||
# NVIDIA parts report memory.total as "[N/A]"/"Not Supported"
|
||||
# because the GPU shares the system LPDDR pool instead of
|
||||
# carrying discrete VRAM. Don't drop the device — remember it so
|
||||
# we report a unified-memory GPU below rather than "No GPU" (#1340).
|
||||
if parts[1]:
|
||||
unified.append({"index": idx, "name": parts[1]})
|
||||
continue
|
||||
|
||||
if not gpus:
|
||||
if unified:
|
||||
# Unified-memory CUDA box: report the GPU backed by system RAM so the
|
||||
# Cookbook recommends models and serving works. The pool is shared
|
||||
# (not per-GPU discrete VRAM), so report the RAM total once.
|
||||
ram_gb = round(_get_ram_gb(), 1)
|
||||
gpus = [{"index": g["index"], "name": g["name"], "vram_gb": ram_gb} for g in unified]
|
||||
return {
|
||||
"gpu_name": gpus[0]["name"],
|
||||
"gpu_vram_gb": ram_gb,
|
||||
"gpu_count": len(gpus),
|
||||
"gpus": gpus,
|
||||
"gpu_groups": _group_gpus(gpus),
|
||||
"homogeneous": True,
|
||||
"backend": "cuda",
|
||||
"unified_memory": True,
|
||||
}
|
||||
return None
|
||||
total_vram = sum(g["vram_gb"] for g in gpus)
|
||||
groups = _group_gpus(gpus)
|
||||
@@ -130,6 +156,33 @@ def _detect_nvidia():
|
||||
}
|
||||
|
||||
|
||||
def classify_amd_gfx(gfx):
    """Map an AMD ISA target (e.g. "gfx1200") to (gfx, family).

    family is one of:
      "rdna"    — consumer Radeon RX (gfx10xx RDNA1/2, gfx11xx RDNA3, gfx12xx RDNA4)
      "cdna"    — datacenter Instinct (gfx908 MI100, gfx90a MI200, gfx94x/95x MI300+)
      "gcn"     — older GCN/Vega (gfx900/906)
      "unknown" — empty/unrecognized; callers must treat conservatively

    This drives the serving decision: vLLM/SGLang on ROCm are validated on CDNA
    but fragile on consumer RDNA (AWQ kernels largely unsupported, FP8 needs
    out-of-tree patches), so RDNA is steered to GGUF/llama.cpp.

    Besides a bare processor name, the input may be a full target ID with
    feature flags ("gfx90a:sramecc+:xnack-") or an ISA name carrying the
    target triple ("amdgcn-amd-amdhsa--gfx1100"); only the processor part is
    classified and returned. Non-string input is treated as unrecognized.

    Returns:
        (gfx, family): the normalized lowercase processor name and its family.
        gfx is "" when the input does not look like a gfx target at all.
    """
    if not isinstance(gfx, str):
        return "", "unknown"
    gfx = gfx.lower().strip()
    # Drop the "<triple>--" prefix and any ":feature+/-" suffixes so that
    # target IDs copied verbatim from ROCm tooling classify like bare names.
    gfx = gfx.rpartition("--")[2].split(":", 1)[0].strip()
    m = re.fullmatch(r"gfx(\d+[a-f]?)", gfx)
    if not m:
        return "", "unknown"
    digits = m.group(1)
    if digits[:2] in ("10", "11", "12"):
        return gfx, "rdna"
    # CDNA must be tested before the generic gfx9 check below, since both
    # share the leading "9".
    if digits in ("908", "90a") or digits[:2] in ("94", "95"):
        return gfx, "cdna"
    if digits[:1] == "9":
        return gfx, "gcn"
    return gfx, "unknown"
|
||||
|
||||
|
||||
def _detect_amd():
|
||||
"""Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
|
||||
and APUs / unified-memory SoCs like Strix Halo (which expose
|
||||
@@ -155,6 +208,17 @@ def _detect_amd():
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
def _amd_arch():
    """Look up the AMD GPU ISA target and family via rocminfo (best effort).

    Tries `rocminfo` on PATH first, then the /opt/rocm/bin location. GPU
    agents in rocminfo output report a `Name: gfxNNNN` line while CPU agents
    report a brand string, so the first gfx token found is taken as the GPU
    ISA. Returns (gfx, family) as produced by classify_amd_gfx; when no
    output or no gfx token is available, an empty target is classified.
    """
    output = ""
    for candidate in ("rocminfo", "/opt/rocm/bin/rocminfo"):
        result = _run([candidate])
        if result:
            output = result
            break
    hit = re.search(r"gfx\d+[a-f]?", output)
    target = hit.group(0) if hit else ""
    return classify_amd_gfx(target)
|
||||
|
||||
try:
|
||||
cards = []
|
||||
is_apu = False
|
||||
@@ -187,6 +251,7 @@ def _detect_amd():
|
||||
return None
|
||||
total_vram = sum(c["vram_gb"] for c in cards)
|
||||
groups = _group_gpus(cards)
|
||||
gfx, family = _amd_arch()
|
||||
# NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
|
||||
# is the real usable GPU memory — it's physically backed but reserved
|
||||
# by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
|
||||
@@ -200,6 +265,13 @@ def _detect_amd():
|
||||
"homogeneous": len(groups) <= 1,
|
||||
"backend": "rocm",
|
||||
"unified_memory": is_apu,
|
||||
# AMD ISA/family so downstream can tell datacenter Instinct (CDNA,
|
||||
# where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon
|
||||
# (RDNA, where the practical path is GGUF via llama.cpp). Empty/
|
||||
# "unknown" when rocminfo isn't available — callers must treat
|
||||
# unknown conservatively, not assume vLLM works.
|
||||
"gpu_arch": gfx,
|
||||
"gpu_family": family,
|
||||
}
|
||||
except Exception:
|
||||
return None
|
||||
@@ -409,7 +481,7 @@ def _detect_windows():
|
||||
" $gpus = @(); "
|
||||
" foreach ($line in $nv -split \"`n\") { "
|
||||
" $p = $line -split ','; "
|
||||
" if ($p.Count -ge 2) { $gpus += @{name=$p[1].Trim(); vram_mb=[double]$p[0].Trim()} } "
|
||||
" if ($p.Count -ge 2) { $gpus += [pscustomobject]@{name=$p[1].Trim(); vram_mb=[double]$p[0].Trim()} } "
|
||||
" }; "
|
||||
" $r.gpu_name = $gpus[0].name; "
|
||||
" $r.gpu_vram_gb = [math]::Round(($gpus | Measure-Object -Property vram_mb -Sum).Sum / 1024, 1); "
|
||||
|
||||
@@ -5,7 +5,9 @@ import re
|
||||
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
|
||||
|
||||
QUANT_BPP = {
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"FP4": 0.50, "NVFP4": 0.50, "MXFP4": 0.50, "NF4": 0.50,
|
||||
"INT4": 0.50, "INT8": 1.0, "W4A16": 0.50, "W8A8": 1.0, "W8A16": 1.0,
|
||||
"Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
|
||||
"Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
|
||||
"AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
|
||||
@@ -14,7 +16,9 @@ QUANT_BPP = {
|
||||
}
|
||||
|
||||
QUANT_SPEED_MULT = {
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85, "INT8": 0.85, "NVFP4": 1.1,
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85,
|
||||
"FP4": 1.15, "NVFP4": 1.15, "MXFP4": 1.15, "NF4": 1.10,
|
||||
"INT4": 1.15, "INT8": 0.85, "W4A16": 1.15, "W8A8": 0.85, "W8A16": 0.85,
|
||||
"Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
|
||||
"Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
|
||||
"AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
|
||||
@@ -23,8 +27,10 @@ QUANT_SPEED_MULT = {
|
||||
}
|
||||
|
||||
QUANT_QUALITY_PENALTY = {
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0, "INT8": 0.0, "NVFP4": -0.5,
|
||||
"Q8_0": -0.5, "Q6_K": -1.5, "Q5_K_M": -2.5,
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0,
|
||||
"FP4": -3.0, "NVFP4": -3.0, "MXFP4": -3.0, "NF4": -4.0,
|
||||
"INT4": -4.0, "INT8": 0.0, "W4A16": -4.0, "W8A8": 0.0, "W8A16": 0.0,
|
||||
"Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
|
||||
"Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
|
||||
# Bare "AWQ" and "AWQ-8bit" used to be 0.0 (tied with FP8). In practice
|
||||
# AWQ-anything is a calibrated reconstruction, not raw 8-bit weights —
|
||||
@@ -36,7 +42,9 @@ QUANT_QUALITY_PENALTY = {
|
||||
}
|
||||
|
||||
QUANT_BYTES_PER_PARAM = {
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"FP4": 0.5, "NVFP4": 0.5, "MXFP4": 0.5, "NF4": 0.5,
|
||||
"INT4": 0.5, "INT8": 1.0, "W4A16": 0.5, "W8A8": 1.0, "W8A16": 1.0,
|
||||
"Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
|
||||
"Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
|
||||
"AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
|
||||
@@ -44,8 +52,55 @@ QUANT_BYTES_PER_PARAM = {
|
||||
"mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
|
||||
}
|
||||
|
||||
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
|
||||
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "INT8", "NVFP4")
|
||||
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy.
|
||||
# These are native HF/vLLM-style repos, not llama.cpp GGUF quant tiers.
|
||||
PREQUANTIZED_PREFIXES = (
|
||||
"AWQ-", "GPTQ-", "mlx-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
|
||||
"INT4", "INT8", "W4A16", "W8A8", "W8A16",
|
||||
)
|
||||
|
||||
|
||||
def infer_quantization_from_name(name):
    """Guess a quantization label from a model/repo name.

    Matching is case-insensitive and ordered from most to least specific;
    the first rule that hits wins. Returns "" when nothing in the name
    indicates a quantization format.
    """
    lowered = (name or "").lower()

    # Unambiguous tokens: a plain substring hit is enough.
    for token, label in (("nvfp4", "NVFP4"), ("mxfp4", "MXFP4")):
        if token in lowered:
            return label

    # Short tokens must stand alone between separators (or at either end of
    # the name) so they don't fire inside longer words.
    delimited = (
        ("nf4", "NF4"),
        ("fp4", "FP4"),
        ("w4a16", "W4A16"),
        ("w8a8", "W8A8"),
        ("w8a16", "W8A16"),
    )
    for token, label in delimited:
        if re.search(r"(^|[-_/])" + token + r"($|[-_/])", lowered):
            return label

    eight_bit = any(tag in lowered for tag in ("8bit", "8-bit", "int8"))

    # Named formats default to their 4-bit variant unless 8-bit is spelled out.
    if "awq" in lowered:
        return "AWQ-8bit" if eight_bit else "AWQ-4bit"
    if "gptq" in lowered:
        return "GPTQ-Int8" if eight_bit else "GPTQ-Int4"
    if "mlx" in lowered:
        if "6bit" in lowered:
            return "mlx-6bit"
        return "mlx-8bit" if eight_bit else "mlx-4bit"

    if "fp8" in lowered:
        return "FP8"

    # Generic integer quantization; 4-bit markers take precedence over 8-bit.
    if any(tag in lowered for tag in ("int4", "4bit", "4-bit")):
        return "INT4"
    if eight_bit:
        return "INT8"
    return ""
|
||||
|
||||
|
||||
def _normalize_model_entry(model):
    """Fill in a catalog entry's quantization from its name when appropriate.

    Non-dict entries are returned untouched. For dicts, a label inferred
    from the name replaces the "quantization" field when that field is
    missing, empty, or "Q4_K_M" (presumably the catalog's placeholder
    default — confirm), or when the entry is flagged "_discovered". The dict
    is mutated in place and returned.
    """
    if not isinstance(model, dict):
        return model

    guessed = infer_quantization_from_name(model.get("name", ""))
    if not guessed:
        return model

    declared = model.get("quantization")
    if declared in (None, "", "Q4_K_M") or model.get("_discovered"):
        model["quantization"] = guessed
    return model
|
||||
|
||||
|
||||
def is_prequantized(model):
|
||||
@@ -72,7 +127,13 @@ def params_b(model):
|
||||
pc = pc.strip().upper()
|
||||
m = re.match(r"^([\d.]+)\s*([BKMGT]?)$", pc)
|
||||
if m:
|
||||
val = float(m.group(1))
|
||||
try:
|
||||
val = float(m.group(1))
|
||||
except ValueError:
|
||||
# Malformed count like "1.5.3B" — [\d.]+ matches but float()
|
||||
# rejects it. One bad catalog row must not abort the whole
|
||||
# ranking pass, so treat it as unknown size.
|
||||
return 0.0
|
||||
suffix = m.group(2)
|
||||
if suffix == "B":
|
||||
return val
|
||||
@@ -180,7 +241,7 @@ def get_models():
|
||||
data_path = os.path.join(os.path.dirname(__file__), "data", "hf_models.json")
|
||||
try:
|
||||
with open(data_path, encoding="utf-8") as f:
|
||||
_models_cache = json.load(f)
|
||||
_models_cache = [_normalize_model_entry(m) for m in json.load(f)]
|
||||
except (FileNotFoundError, json.JSONDecodeError):
|
||||
_models_cache = []
|
||||
return _models_cache
|
||||
|
||||
@@ -0,0 +1,229 @@
|
||||
"""Compute intelligent llama.cpp serve profiles from detected hardware.
|
||||
|
||||
Given a system (VRAM/RAM/arch) and a model, produce up to three ready-to-launch
|
||||
profiles — Quality / Balanced / Speed — with concrete llama.cpp flags
|
||||
(n_gpu_layers, n_cpu_moe, cache-type, context). This turns the by-hand tuning
|
||||
(how many MoE layers fit on the GPU, when to spend VRAM on a q8 KV cache vs more
|
||||
context, how much headroom to leave for a vision encoder) into a formula.
|
||||
|
||||
Pure/deterministic — no benchmarking, no I/O. Reuses the same VRAM math as
|
||||
fit.py/models.py so "what the Cookbook recommends" and "what it serves" agree.
|
||||
|
||||
NOTE: token/s figures are NOT computed here — real speed on partial-offload MoE
|
||||
is CPU-bound and not reliably predictable from specs. The UI labels profiles by
|
||||
their tradeoff (Quality/Balanced/Speed), and the VRAM fit (the part that decides
|
||||
whether it even loads) is what's computed from real numbers.
|
||||
"""
|
||||
|
||||
from services.hwfit.models import (
|
||||
QUANT_BPP,
|
||||
params_b,
|
||||
_active_params_b,
|
||||
is_prequantized,
|
||||
)
|
||||
|
||||
# GGUF KV-cache size multiplier by cache type, relative to q8_0 (= 1.0).
|
||||
# q4_0 is ~half of q8_0 is ~half of f16. The 8e-6 base in estimate_memory_gb is
|
||||
# the q8_0-ish figure; scale from there.
|
||||
_KV_FACTOR = {"q4_0": 0.5, "q8_0": 1.0, "f16": 2.0}
|
||||
|
||||
# Quant ladder from highest quality/size down. A profile that wants "best quant
|
||||
# that fits fully on GPU" walks this until one fits.
|
||||
_QUANT_LADDER = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
|
||||
|
||||
|
||||
def _weights_gb(model, quant, fixed_gb=None):
    """Return the size of the full model weights.

    A positive fixed_gb means a specific GGUF file already on disk is being
    served: its real size is authoritative and `quant` is ignored. Otherwise
    the size is estimated as the parameter count (params_b) times the
    QUANT_BPP entry for `quant`, using 0.58 when the quant is not in the
    table.
    """
    if not (fixed_gb and fixed_gb > 0):
        size_b = params_b(model)
        return size_b * QUANT_BPP.get(quant, 0.58)
    return float(fixed_gb)
|
||||
|
||||
|
||||
def _kv_gb(model, ctx, kv_type):
    """Estimate KV-cache VRAM for context length `ctx` and cache type `kv_type`.

    Scales a 0.000008-per-token base by the model's active parameter count
    and by the _KV_FACTOR multiplier for the cache type (1.0 for unknown
    types).
    """
    active_b = _active_params_b(model)
    type_scale = _KV_FACTOR.get(kv_type, 1.0)
    return 0.000008 * active_b * ctx * type_scale
|
||||
|
||||
|
||||
def _n_layers(model):
    """Return a best-effort transformer block count (used for n-cpu-moe math).

    Prefers the first positive numeric value among the layer-count keys a
    catalog entry may carry; otherwise falls back to a size-based guess.
    """
    layer_keys = ("num_hidden_layers", "n_layers", "num_layers", "block_count")
    for key in layer_keys:
        declared = model.get(key)
        if isinstance(declared, (int, float)) and declared > 0:
            return int(declared)

    # No usable field: guess from parameter count — most MoE/dense LLMs land
    # in the 28-64 layer range.
    size_b = params_b(model)
    for min_size_b, guess in ((60, 64), (25, 48), (12, 40)):
        if size_b >= min_size_b:
            return guess
    return 32
|
||||
|
||||
|
||||
def _cpu_moe_for_budget(model, quant, kv_gb, vram_budget_gb, fixed_gb=None):
    """Work out how many MoE layers must move to CPU to fit the VRAM budget.

    Returns (n_cpu_moe, fits_fully). The requirement is weights + KV cache
    + 0.6 GB of runtime/compute buffers. If that already fits the budget the
    result is (0, True). Dense models have no per-expert offload knob, so an
    overflowing dense model yields (0, False). For MoE models each offloaded
    layer is assumed to free roughly weights / n_layers of VRAM, and the
    count is clamped to [0, n_layers].
    """
    import math

    weights = _weights_gb(model, quant, fixed_gb)
    required = weights + kv_gb + 0.6  # +0.6 GB runtime/compute buffers
    if required <= vram_budget_gb:
        return 0, True

    if not model.get("is_moe"):
        # Dense: either it fits or it spills via -ngl; nothing to compute here.
        return 0, False

    layer_count = _n_layers(model)
    gb_per_layer = weights / max(layer_count, 1)
    shortfall = required - vram_budget_gb
    # Guard the divisor so a zero-size estimate cannot divide by zero.
    to_cpu = math.ceil(shortfall / max(gb_per_layer, 1e-6))
    return max(0, min(to_cpu, layer_count)), False
|
||||
|
||||
|
||||
def compute_serve_profiles(system, model, serve_weights_gb=None, serve_quant=None):
    """Return a list of profile dicts for llama.cpp serving of `model` on `system`.

    Each profile: {key, label, quant, n_gpu_layers, n_cpu_moe, cache_type, ctx,
                   est_vram_gb, fits, offloads, note}. Empty list when the
    system reports no GPU VRAM (caller should fall back to manual flags).

    DOWNLOAD mode (default): the quant isn't chosen yet, so profiles vary it
    (Quality prefers Q6_K, Balanced Q4_K_M, Speed the largest quant from
    Q4_K_M down that fits fully on the GPU) to show download options.

    SERVE mode (serve_weights_gb set): a specific GGUF file already exists on
    disk — its quant is FIXED. Profiles then keep that quant/size and differ only
    in the actual serving knobs (n_cpu_moe, KV-cache type, context). serve_quant
    is the file's quant label (e.g. "Q4_K_M") just for display.

    Context handling: every profile starts at min(profile target, model limit)
    and, for dense models that do not fit, is halved down to a floor of
    min(8192, starting context). A profile is always emitted at the floor, so
    short-context models and non-power-of-two context limits still get one.
    """
    vram = float(system.get("gpu_vram_gb") or 0)
    if vram <= 0:
        return []

    serve_mode = bool(serve_weights_gb and serve_weights_gb > 0)

    # Never propose more context than the model was trained for — asking llama.cpp
    # for ctx > n_ctx_train triggers a "training context overflow" and, with a
    # quantized KV cache, an oversized allocation that can crash the GPU
    # (radv/amdgpu ErrorDeviceLost). Cap every profile at the model's real limit.
    model_ctx_max = 0
    for k in ("context_length", "max_position_embeddings", "n_ctx_train", "context"):
        v = model.get(k)
        if isinstance(v, (int, float)) and v > 0:
            model_ctx_max = int(v)
            break
    if model_ctx_max <= 0:
        model_ctx_max = 131072  # conservative default when the catalog omits it

    # Vision models need headroom for the image encoder (~1 GB on top of weights).
    # NOTE(review): the "vl" substring test is a loose heuristic and will also
    # match unrelated names containing those letters.
    is_vision = bool(
        model.get("is_multimodal") or model.get("vision") or model.get("mmproj")
        or "vl" in str(model.get("name", "")).lower()
    )
    headroom = 1.1 if is_vision else 0.4
    budget = max(vram - headroom, 1.0)

    # Prequantized (AWQ/GPTQ/FP8) served via GGUF fallback use a fixed ~Q4 quant;
    # GGUF models can pick their quant. Pick a sensible per-profile quant.
    fixed_quant = model.get("quantization") if is_prequantized(model) else None

    is_moe = bool(model.get("is_moe"))

    def _pick_quant(prefer, require_full_fit):
        """Choose a quant for a profile.

        - fixed_quant (AWQ/GPTQ/FP8 served via GGUF): always that.
        - require_full_fit=True (Speed): walk DOWN from `prefer` to the best quant
          whose weights fit fully on the GPU (no offload) — fastest.
        - require_full_fit=False (Quality on MoE): keep `prefer` even if it must
          offload experts to CPU; that's the whole point of n-cpu-moe on a card
          too small to hold the weights. For dense models we can't offload
          per-expert, so fall back to the largest fully-fitting quant.
        """
        if fixed_quant:
            return fixed_quant
        start = _QUANT_LADDER.index(prefer) if prefer in _QUANT_LADDER else 3
        if require_full_fit or not is_moe:
            for q in _QUANT_LADDER[start:]:
                if _weights_gb(model, q) + 0.6 <= budget:
                    return q
            return _QUANT_LADDER[-1]
        # MoE quality: keep the preferred (big) quant; offload handles overflow.
        return prefer

    if serve_mode:
        # Fixed file on disk — quant can't change. Vary only the serving knobs.
        fq = serve_quant or model.get("quantization") or "GGUF"
        specs = [
            # key, label, prefer_quant, full_fit, kv_type, ctx, note
            ("quality", "Quality", fq, False, "q8_0", 131072,
             "Sharp q8 KV cache + full context. Best long-context accuracy; offloads MoE layers to CPU if needed."),
            ("balanced", "Balanced", fq, False, "q4_0", 131072,
             "Compact q4 KV at full context — good speed/quality mix."),
            ("speed", "Speed", fq, False, "q4_0", 32768,
             "Trimmed context + light KV for the fastest tokens/s."),
        ]
    else:
        specs = [
            # key, label, prefer_quant, full_fit, kv_type, ctx, note
            ("quality", "Quality", "Q6_K", False, "q8_0", 131072,
             "Biggest quant + sharp q8 KV cache. Best answers; offloads MoE layers to CPU if needed."),
            ("balanced", "Balanced", "Q4_K_M", False, "q4_0", 131072,
             "Q4 weights + compact q4 KV. Good speed/quality mix at full context."),
            ("speed", "Speed", "Q4_K_M", True, "q4_0", 32768,
             "Smallest offload + trimmed context for the fastest tokens/s."),
        ]

    min_ctx = 8192  # don't shrink context below this unless the model's own limit is smaller

    profiles = []
    for key, label, prefer_q, full_fit, kv_type, ctx, note in specs:
        # In serve mode the quant is fixed (the file's); in download mode we pick.
        quant = prefer_q if serve_mode else _pick_quant(prefer_q, full_fit)
        # Shrink context if even the chosen KV won't fit alongside weights.
        # Start from the smaller of the profile's target and the model's limit.
        cur_ctx = min(ctx, model_ctx_max)
        # The floor follows the model when its limit is below min_ctx, so a
        # short-context model still yields a profile instead of none at all.
        ctx_floor = min(cur_ctx, min_ctx)
        while True:
            kv = _kv_gb(model, cur_ctx, kv_type)
            n_cpu_moe, fits = _cpu_moe_for_budget(model, quant, kv, budget, fixed_gb=serve_weights_gb)
            est = _weights_gb(model, quant, serve_weights_gb) + kv + 0.6
            # If a non-MoE model can't fit even fully offloaded, try less context.
            if is_moe or fits or cur_ctx <= ctx_floor:
                profiles.append({
                    "key": key,
                    "label": label,
                    "quant": quant,
                    "n_gpu_layers": 999,
                    "n_cpu_moe": n_cpu_moe,
                    "cache_type": kv_type,
                    "ctx": cur_ctx,
                    # When experts offload, GPU-resident VRAM tops out at the
                    # budget (weights beyond it live in system RAM), so cap the
                    # estimate at `budget`, not the full card — this also leaves
                    # the vision-encoder headroom visible in the number.
                    "est_vram_gb": round(min(est, budget), 1),
                    # For MoE we treat it as fitting via offload; report whether
                    # it fit WITHOUT offload as the "clean" flag.
                    "fits": fits or is_moe,
                    "offloads": n_cpu_moe > 0,
                    "note": note,
                })
                break
            # Clamp to the floor: halving a limit that is not a power-of-two
            # multiple of it (e.g. 40960 -> 5120) would otherwise skip past the
            # floor and leave this profile unemitted.
            cur_ctx = max(cur_ctx // 2, ctx_floor)

    # De-dupe identical profiles (e.g. tiny model where all three collapse to the
    # same all-GPU config) — keep the first/highest-quality label.
    seen = set()
    deduped = []
    for p in profiles:
        sig = (p["quant"], p["n_cpu_moe"], p["cache_type"], p["ctx"])
        if sig in seen:
            continue
        seen.add(sig)
        deduped.append(p)
    return deduped
|
||||
Reference in New Issue
Block a user