mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
b20cea347a
Allow serve-profile generation for models whose trained context window is below 8192 while preserving the 8K shrink floor for larger models.
236 lines
11 KiB
Python
236 lines
11 KiB
Python
"""Compute intelligent llama.cpp serve profiles from detected hardware.
|
|
|
|
Given a system (VRAM/RAM/arch) and a model, produce 1-4 ready-to-launch
|
|
profiles — Quality / Balanced / Speed — with concrete llama.cpp flags
|
|
(n_gpu_layers, n_cpu_moe, cache-type, context). This turns the by-hand tuning
|
|
(how many MoE layers fit on the GPU, when to spend VRAM on a q8 KV cache vs more
|
|
context, how much headroom to leave for a vision encoder) into a formula.
|
|
|
|
Pure/deterministic — no benchmarking, no I/O. Reuses the same VRAM math as
|
|
fit.py/models.py so "what the Cookbook recommends" and "what it serves" agree.
|
|
|
|
NOTE: token/s figures are NOT computed here — real speed on partial-offload MoE
|
|
is CPU-bound and not reliably predictable from specs. The UI labels profiles by
|
|
their tradeoff (Quality/Balanced/Speed), and the VRAM fit (the part that decides
|
|
whether it even loads) is what's computed from real numbers.
|
|
"""
|
|
|
|
from services.hwfit.models import (
|
|
QUANT_BPP,
|
|
params_b,
|
|
_active_params_b,
|
|
is_prequantized,
|
|
)
|
|
|
|
# GGUF KV-cache cost per token, in bytes-per-active-billion-param, by cache type.
|
|
# q4_0 is ~half of q8_0 is ~half of f16. The 8e-6 base in estimate_memory_gb is
|
|
# the q8_0-ish figure; scale from there.
|
|
_KV_FACTOR = {"q4_0": 0.5, "q8_0": 1.0, "f16": 2.0}
|
|
|
|
# Quant ladder from highest quality/size down. A profile that wants "best quant
|
|
# that fits fully on GPU" walks this until one fits.
|
|
_QUANT_LADDER = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
|
|
|
|
|
|
def _weights_gb(model, quant, fixed_gb=None):
|
|
"""VRAM for the full weights. When fixed_gb is given (serving a specific GGUF
|
|
file already on disk), use its real size — the quant is whatever the file is,
|
|
not something we get to pick."""
|
|
if fixed_gb and fixed_gb > 0:
|
|
return float(fixed_gb)
|
|
return params_b(model) * QUANT_BPP.get(quant, 0.58)
|
|
|
|
|
|
def _kv_gb(model, ctx, kv_type):
|
|
"""KV-cache VRAM at a context length and cache type."""
|
|
kv_params = _active_params_b(model)
|
|
return 0.000008 * kv_params * ctx * _KV_FACTOR.get(kv_type, 1.0)
|
|
|
|
|
|
def _n_layers(model):
|
|
"""Best-effort total transformer block count (for n-cpu-moe math)."""
|
|
for k in ("num_hidden_layers", "n_layers", "num_layers", "block_count"):
|
|
v = model.get(k)
|
|
if isinstance(v, (int, float)) and v > 0:
|
|
return int(v)
|
|
# Fallback heuristic by size — most MoE/dense LLMs land 28-64 layers.
|
|
pb = params_b(model)
|
|
if pb >= 60:
|
|
return 64
|
|
if pb >= 25:
|
|
return 48
|
|
if pb >= 12:
|
|
return 40
|
|
return 32
|
|
|
|
|
|
def _cpu_moe_for_budget(model, quant, kv_gb, vram_budget_gb, fixed_gb=None):
|
|
"""How many MoE layers must move to CPU so weights+KV fit vram_budget_gb.
|
|
|
|
Returns (n_cpu_moe, fits_fully). When the model already fits, n_cpu_moe=0.
|
|
Each offloaded layer frees roughly weights/n_layers of VRAM. We only model
|
|
this for MoE (where --n-cpu-moe applies); dense models just report whether
|
|
they fit at the given n_gpu_layers=999.
|
|
"""
|
|
weights = _weights_gb(model, quant, fixed_gb)
|
|
needed = weights + kv_gb + 0.6 # +0.6 GB runtime/compute buffers
|
|
if needed <= vram_budget_gb:
|
|
return 0, True
|
|
if not model.get("is_moe"):
|
|
# Dense: no per-expert offload knob; either it fits or it spills via -ngl.
|
|
return 0, False
|
|
layers = _n_layers(model)
|
|
per_layer = weights / max(layers, 1)
|
|
overflow = needed - vram_budget_gb
|
|
import math
|
|
n = math.ceil(overflow / max(per_layer, 1e-6))
|
|
n = max(0, min(n, layers)) # clamp
|
|
return n, False
|
|
|
|
|
|
def compute_serve_profiles(system, model, serve_weights_gb=None, serve_quant=None):
|
|
"""Return a list of profile dicts for llama.cpp serving of `model` on `system`.
|
|
|
|
Each profile: {key, label, quant, n_gpu_layers, n_cpu_moe, cache_type, ctx,
|
|
est_vram_gb, fits, note}. Empty list if no GGUF path makes
|
|
sense (caller should fall back to manual flags).
|
|
|
|
DOWNLOAD mode (default): the quant isn't chosen yet, so profiles vary it
|
|
(Quality=Q6, Balanced=Q4, Speed=Q2…) to show download options.
|
|
|
|
SERVE mode (serve_weights_gb set): a specific GGUF file already exists on
|
|
disk — its quant is FIXED. Profiles then keep that quant/size and differ only
|
|
in the actual serving knobs (n_cpu_moe, KV-cache type, context). serve_quant
|
|
is the file's quant label (e.g. "Q4_K_M") just for display.
|
|
"""
|
|
vram = float(system.get("gpu_vram_gb") or 0)
|
|
if vram <= 0:
|
|
return []
|
|
|
|
serve_mode = bool(serve_weights_gb and serve_weights_gb > 0)
|
|
|
|
# Never propose more context than the model was trained for — asking llama.cpp
|
|
# for ctx > n_ctx_train triggers a "training context overflow" and, with a
|
|
# quantized KV cache, an oversized allocation that can crash the GPU
|
|
# (radv/amdgpu ErrorDeviceLost). Cap every profile at the model's real limit.
|
|
model_ctx_max = 0
|
|
for k in ("context_length", "max_position_embeddings", "n_ctx_train", "context"):
|
|
v = model.get(k)
|
|
if isinstance(v, (int, float)) and v > 0:
|
|
model_ctx_max = int(v)
|
|
break
|
|
if model_ctx_max <= 0:
|
|
model_ctx_max = 131072 # conservative default when the catalog omits it
|
|
|
|
# Vision models need headroom for the image encoder (~1 GB on top of weights).
|
|
is_vision = bool(
|
|
model.get("is_multimodal") or model.get("vision") or model.get("mmproj")
|
|
or "vl" in str(model.get("name", "")).lower()
|
|
)
|
|
headroom = 1.1 if is_vision else 0.4
|
|
budget = max(vram - headroom, 1.0)
|
|
|
|
# Prequantized (AWQ/GPTQ/FP8) served via GGUF fallback use a fixed ~Q4 quant;
|
|
# GGUF models can pick their quant. Pick a sensible per-profile quant.
|
|
fixed_quant = model.get("quantization") if is_prequantized(model) else None
|
|
|
|
is_moe = bool(model.get("is_moe"))
|
|
|
|
def _pick_quant(prefer, require_full_fit):
|
|
"""Choose a quant for a profile.
|
|
|
|
- fixed_quant (AWQ/GPTQ/FP8 served via GGUF): always that.
|
|
- require_full_fit=True (Speed): walk DOWN from `prefer` to the best quant
|
|
whose weights fit fully on the GPU (no offload) — fastest.
|
|
- require_full_fit=False (Quality on MoE): keep `prefer` even if it must
|
|
offload experts to CPU; that's the whole point of n-cpu-moe on a card
|
|
too small to hold the weights. For dense models we can't offload
|
|
per-expert, so fall back to the largest fully-fitting quant.
|
|
"""
|
|
if fixed_quant:
|
|
return fixed_quant
|
|
start = _QUANT_LADDER.index(prefer) if prefer in _QUANT_LADDER else 3
|
|
if require_full_fit or not is_moe:
|
|
for q in _QUANT_LADDER[start:]:
|
|
if _weights_gb(model, q) + 0.6 <= budget:
|
|
return q
|
|
return _QUANT_LADDER[-1]
|
|
# MoE quality: keep the preferred (big) quant; offload handles overflow.
|
|
return prefer
|
|
|
|
if serve_mode:
|
|
# Fixed file on disk — quant can't change. Vary only the serving knobs.
|
|
fq = serve_quant or model.get("quantization") or "GGUF"
|
|
specs = [
|
|
# key, label, prefer_quant, full_fit, kv_type, ctx, note
|
|
("quality", "Quality", fq, False, "q8_0", 131072,
|
|
"Sharp q8 KV cache + full context. Best long-context accuracy; offloads MoE layers to CPU if needed."),
|
|
("balanced", "Balanced", fq, False, "q4_0", 131072,
|
|
"Compact q4 KV at full context — good speed/quality mix."),
|
|
("speed", "Speed", fq, False, "q4_0", 32768,
|
|
"Trimmed context + light KV for the fastest tokens/s."),
|
|
]
|
|
else:
|
|
specs = [
|
|
# key, label, prefer_quant, full_fit, kv_type, ctx, note
|
|
("quality", "Quality", "Q6_K", False, "q8_0", 131072,
|
|
"Biggest quant + sharp q8 KV cache. Best answers; offloads MoE layers to CPU if needed."),
|
|
("balanced", "Balanced", "Q4_K_M", False, "q4_0", 131072,
|
|
"Q4 weights + compact q4 KV. Good speed/quality mix at full context."),
|
|
("speed", "Speed", "Q4_K_M", True, "q4_0", 32768,
|
|
"Smallest offload + trimmed context for the fastest tokens/s."),
|
|
]
|
|
|
|
profiles = []
|
|
for key, label, prefer_q, full_fit, kv_type, ctx, note in specs:
|
|
# In serve mode the quant is fixed (the file's); in download mode we pick.
|
|
quant = prefer_q if serve_mode else _pick_quant(prefer_q, full_fit)
|
|
# Shrink context if even the chosen KV won't fit alongside weights.
|
|
# Start from the smaller of the profile's target and the model's limit.
|
|
cur_ctx = min(ctx, model_ctx_max)
|
|
# Floor the context-shrink loop at 8192, but never above the model's own
|
|
# trained limit. A model with a sub-8192 context (e.g. a 2048-token
|
|
# SmolLM) starts below 8192, so a hard-coded 8192 guard skipped the loop
|
|
# entirely and produced NO profile — the serve UI then fell back to
|
|
# manual flags even though the model fits the GPU trivially.
|
|
ctx_floor = min(8192, model_ctx_max)
|
|
while cur_ctx >= ctx_floor:
|
|
kv = _kv_gb(model, cur_ctx, kv_type)
|
|
n_cpu_moe, fits = _cpu_moe_for_budget(model, quant, kv, budget, fixed_gb=serve_weights_gb)
|
|
est = _weights_gb(model, quant, serve_weights_gb) + kv + 0.6
|
|
# If a non-MoE model can't fit even fully offloaded, try less context.
|
|
if model.get("is_moe") or fits or cur_ctx <= ctx_floor:
|
|
profiles.append({
|
|
"key": key,
|
|
"label": label,
|
|
"quant": quant,
|
|
"n_gpu_layers": 999,
|
|
"n_cpu_moe": n_cpu_moe,
|
|
"cache_type": kv_type,
|
|
"ctx": cur_ctx,
|
|
# When experts offload, GPU-resident VRAM tops out at the
|
|
# budget (weights beyond it live in system RAM), so cap the
|
|
# estimate at `budget`, not the full card — this also leaves
|
|
# the vision-encoder headroom visible in the number.
|
|
"est_vram_gb": round(min(est, budget), 1),
|
|
# For MoE we treat it as fitting via offload; report whether
|
|
# it fit WITHOUT offload as the "clean" flag.
|
|
"fits": fits or bool(model.get("is_moe")),
|
|
"offloads": n_cpu_moe > 0,
|
|
"note": note,
|
|
})
|
|
break
|
|
cur_ctx //= 2
|
|
|
|
# De-dupe identical profiles (e.g. tiny model where all three collapse to the
|
|
# same all-GPU config) — keep the first/highest-quality label.
|
|
seen = set()
|
|
deduped = []
|
|
for p in profiles:
|
|
sig = (p["quant"], p["n_cpu_moe"], p["cache_type"], p["ctx"])
|
|
if sig in seen:
|
|
continue
|
|
seen.add(sig)
|
|
deduped.append(p)
|
|
return deduped
|