mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
Odysseus v1.0
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,463 @@
|
||||
import re
|
||||
|
||||
from services.hwfit.models import (
|
||||
params_b, estimate_memory_gb, infer_use_case,
|
||||
get_models, is_prequantized, _active_params_b, QUANT_BYTES_PER_PARAM,
|
||||
QUANT_SPEED_MULT, QUANT_QUALITY_PENALTY,
|
||||
)
|
||||
|
||||
# Memory bandwidth per GPU model, keyed by a lowercase fragment of the
# marketing name. Values are used by _estimate_speed as a GB/s-style figure.
# Entry order is significant: it breaks ties between equal-length keys when
# the keys are length-sorted below.
GPU_BANDWIDTH = {
    # NVIDIA GeForce 50 series
    "5090": 1792,
    "5080": 960,
    "5070 ti": 896,
    "5070": 672,
    "5060 ti": 448,
    "5060": 256,
    # NVIDIA GeForce 40 series
    "4090": 1008,
    "4080 super": 736,
    "4080": 717,
    "4070 ti super": 672,
    "4070 ti": 504,
    "4070 super": 504,
    "4070": 504,
    "4060 ti": 288,
    "4060": 272,
    # NVIDIA GeForce 30 series
    "3090 ti": 1008,
    "3090": 936,
    "3080 ti": 912,
    "3080": 760,
    "3070 ti": 608,
    "3070": 448,
    "3060 ti": 448,
    "3060": 360,
    # NVIDIA GeForce 20 series
    "2080 ti": 616,
    "2080 super": 496,
    "2080": 448,
    "2070 super": 448,
    "2070": 448,
    "2060 super": 448,
    "2060": 336,
    # NVIDIA GeForce 16 series
    "1660 ti": 288,
    "1660 super": 336,
    "1660": 192,
    "1650 super": 192,
    "1650": 128,
    # NVIDIA datacenter / workstation
    "h100 sxm": 3350,
    "h100": 2039,
    "h200": 4800,
    "a100 sxm": 2039,
    "a100": 1555,
    "l40s": 864,
    "l40": 864,
    "l4": 300,
    "a10g": 600,
    "a10": 600,
    "t4": 320,
    "v100 sxm": 900,
    "v100": 897,
    "a6000": 768,
    "a5000": 768,
    "a4000": 448,
    # AMD Radeon RX 7000 series
    "7900 xtx": 960,
    "7900 xt": 800,
    "7900 gre": 576,
    "7800 xt": 624,
    "7700 xt": 432,
    "7600": 288,
    # AMD Radeon RX 6000 series
    "6950 xt": 576,
    "6900 xt": 512,
    "6800 xt": 512,
    "6800": 512,
    "6700 xt": 384,
    "6600 xt": 256,
    "6600": 224,
    # AMD Instinct
    "mi300x": 5300,
    "mi300": 5300,
    "mi250x": 3277,
    "mi250": 3277,
    "mi210": 1638,
    "mi100": 1229,
    # AMD Radeon RX 9000 series
    "9070 xt": 624,
    "9070": 488,
}

# Longest keys first, so that a substring scan meets the most specific name
# ("4070 ti super") before any shorter key it contains ("4070").
_BW_KEYS_SORTED = sorted(GPU_BANDWIDTH, key=len, reverse=True)

# Per-backend coefficient for the parameter-count-based speed fallback that
# _estimate_speed uses when no bandwidth figure is available.
FALLBACK_K = {
    "cuda": 220,
    "rocm": 180,
    "cpu_x86": 70,
    "cpu_arm": 90,
}
|
||||
|
||||
# Composite-score weights per use case, in the order
# (quality, speed, fit, context). Each tuple sums to 1.0.
USE_CASE_WEIGHTS = {
    "general":    (0.45, 0.30, 0.15, 0.10),
    "coding":     (0.50, 0.20, 0.15, 0.15),
    "reasoning":  (0.55, 0.15, 0.15, 0.15),
    "chat":       (0.40, 0.35, 0.15, 0.10),
    "multimodal": (0.50, 0.20, 0.15, 0.15),
    "embedding":  (0.30, 0.40, 0.20, 0.10),
    "tts":        (0.40, 0.35, 0.15, 0.10),
    "stt":        (0.40, 0.35, 0.15, 0.10),
}

# Tokens-per-second figure that earns a full speed score for each use case.
SPEED_TARGET = {
    "general": 40,
    "coding": 40,
    "multimodal": 40,
    "chat": 40,
    "reasoning": 25,
    "embedding": 200,
    "tts": 40,
    "stt": 40,
}

# Context length (tokens) that earns a full context score for each use case.
CONTEXT_TARGET = {
    "general": 4096,
    "chat": 4096,
    "coding": 8192,
    "reasoning": 8192,
    "multimodal": 4096,
    "embedding": 512,
    "tts": 2048,
    "stt": 2048,
}
|
||||
|
||||
|
||||
def _lookup_bandwidth(gpu_name):
    """Return the GPU_BANDWIDTH entry whose key occurs in *gpu_name*.

    Matching is a case-insensitive substring test. Keys are tried longest
    first, so the most specific model name wins. Returns None for an empty
    name or when no key matches.
    """
    if not gpu_name:
        return None
    lowered = gpu_name.lower()
    return next(
        (GPU_BANDWIDTH[fragment] for fragment in _BW_KEYS_SORTED if fragment in lowered),
        None,
    )
|
||||
|
||||
|
||||
def _estimate_speed(model, quant, run_mode, system):
    """Estimate decode speed in tokens per second.

    Works from the active parameter count, so a MoE model is sized by the
    experts that run per token rather than by its total size.

    Two estimators are used:
    * bandwidth-based, when the GPU name is found in GPU_BANDWIDTH and the
      model runs in "gpu" or "cpu_offload" mode;
    * a per-backend coefficient divided by the parameter count otherwise.

    Returns 0.0 when the model size works out to zero or less.
    """
    active_b = _active_params_b(model)
    bandwidth = _lookup_bandwidth(system.get("gpu_name"))

    if not (bandwidth and run_mode in ("gpu", "cpu_offload")):
        # No usable bandwidth figure: scale a backend constant by size and quant.
        if active_b <= 0:
            return 0.0
        coeff = FALLBACK_K.get(system.get("backend", "cpu_x86"), 70)
        return coeff / active_b * QUANT_SPEED_MULT.get(quant, 1.0)

    weights_gb = active_b * QUANT_BYTES_PER_PARAM.get(quant, 0.5)
    if weights_gb <= 0:
        return 0.0

    # 0.55 is the fraction of the bandwidth figure assumed to be achieved.
    ideal_tps = (bandwidth / weights_gb) * 0.55

    if run_mode == "cpu_offload":
        scale = 0.5
    elif model.get("is_moe", False):
        scale = 0.8
    else:
        scale = 1.0
    return ideal_tps * scale
|
||||
|
||||
|
||||
def _quality_score(model, quant, use_case):
    """Score expected output quality on a 0-100 scale.

    The score starts from a tier chosen by total parameter count, adds a
    small bonus per recognised model family found in the name, applies the
    quantization adjustment from QUANT_QUALITY_PENALTY, and adds a boost when
    the model's inferred specialty matches the requested *use_case*.
    """
    size_b = params_b(model)

    # (exclusive upper bound in billions of parameters, base score)
    size_tiers = ((1, 30), (3, 45), (7, 60), (10, 75), (20, 82), (40, 89))
    score = next((points for ceiling, points in size_tiers if size_b < ceiling), 95)

    # Family bonuses stack if a name mentions more than one family.
    label = model.get("name", "").lower()
    family_bonus = (
        (("qwen",), 2),
        (("deepseek",), 3),
        (("llama",), 2),
        (("mistral", "mixtral"), 1),
        (("gemma",), 1),
    )
    for needles, bonus in family_bonus:
        if any(needle in label for needle in needles):
            score += bonus

    score += QUANT_QUALITY_PENALTY.get(quant, 0)

    # Specialty boost: only when the model's own use case is the one asked for.
    if infer_use_case(model) == use_case:
        if use_case in ("coding", "multimodal"):
            score += 6
        elif use_case == "reasoning" and size_b >= 13:
            score += 5

    return max(0, min(100, score))
|
||||
|
||||
|
||||
def _speed_score(tps, use_case):
    """Map tokens/second onto 0-100 relative to the use case's speed target.

    Reaching the target (40 tok/s for an unknown use case) scores 100.
    """
    goal = SPEED_TARGET.get(use_case, 40)
    percent_of_goal = (tps / goal) * 100
    return max(0, min(100, percent_of_goal))
|
||||
|
||||
|
||||
def _fit_score(required, available):
    """Score how well *required* memory fits into *available* memory (0-100).

    Returns 0 when there is no budget or the model does not fit. Otherwise
    the score depends on the utilisation ratio: it rises linearly from 60 to
    100 up to 50% use, stays at 100 up to 80%, then drops to 70 (up to 90%)
    and 50 (above 90%) as headroom runs out.
    """
    if available <= 0 or required > available:
        return 0

    used = required / available
    if used <= 0.5:
        return 60 + (used / 0.5) * 40
    if used <= 0.8:
        return 100
    return 70 if used <= 0.9 else 50
|
||||
|
||||
|
||||
def _context_score(ctx, use_case):
    """Score a usable context length against the use case's target.

    Returns 100 at or above the target (4096 for an unknown use case), 70 at
    half the target or more, and 30 below that.
    """
    goal = CONTEXT_TARGET.get(use_case, 4096)
    if ctx < goal / 2:
        return 30
    return 100 if ctx >= goal else 70
|
||||
|
||||
|
||||
def _try_quant_at(model, quant, ctx, gpu_vram, available_ram):
    """Find a placement for *model* at *quant*, shrinking context if needed.

    The requested context is tried first. If it does not fit, the context is
    halved repeatedly for as long as it stays at or above 1024 tokens. At
    each size, VRAM is preferred over system RAM.

    Returns (run_mode, quant, ctx, mem_gb), where run_mode is "gpu",
    "cpu_offload" (a GPU exists but the model only fits in RAM) or
    "cpu_only" (no GPU). Returns None when nothing fits.
    """
    has_gpu = gpu_vram > 0
    ram_mode = "cpu_offload" if has_gpu else "cpu_only"

    candidate_ctx = ctx
    while True:
        needed = estimate_memory_gb(model, quant, candidate_ctx)
        if has_gpu and needed <= gpu_vram:
            return "gpu", quant, candidate_ctx, needed
        if needed <= available_ram:
            return ram_mode, quant, candidate_ctx, needed

        # Halve the context and give up once it would drop below 1024.
        candidate_ctx //= 2
        if candidate_ctx < 1024:
            return None
|
||||
|
||||
|
||||
def _quant_bits(q):
    """Return the approximate bit-width implied by a quantization label.

    Lets GGUF quant tiers (Q4/Q8/...) be compared with prequantized formats
    (AWQ 4, AWQ-8bit, FP8, GPTQ-4bit, ...). Hyphens, underscores, spaces and
    case are ignored. F16, BF16 and F32 all report 16.

    Returns 0 when the width cannot be determined; callers treat that as
    "don't filter".
    """
    label = (q or "").upper()
    for filler in ("-", "_", " "):
        label = label.replace(filler, "")

    # FP8 counts wherever it appears in the label, not only as a prefix.
    if "FP8" in label:
        return 8

    # GGUF k-quants and float formats, recognised by prefix.
    prefix_bits = (
        (("Q8",), 8),
        (("Q4", "IQ4"), 4),
        (("Q2", "IQ2"), 2),
        (("Q3", "IQ3"), 3),
        (("Q5",), 5),
        (("Q6",), 6),
        (("F16", "BF16", "F32"), 16),
    )
    for prefixes, bits in prefix_bits:
        if label.startswith(prefixes):
            return bits

    # Prequantized formats: take the digits after the scheme name
    # (AWQ4 / GPTQ8 / INT8 ...), or failing that the digits before "BIT".
    found = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", label)
    if found is None:
        found = re.search(r"(\d{1,2})BIT", label)
    if found is not None:
        width = int(found.group(1))
        if 2 <= width <= 16:
            return width
    return 0
|
||||
|
||||
|
||||
def analyze_model(model, system, target_quant=None):
    """Evaluate how one model fits and performs on the described hardware.

    Args:
        model: catalog entry (dict) for the model.
        system: hardware description (dict); the keys read here are
            has_gpu, gpu_vram_gb, gpu_count, available_ram_gb and gpu_only.
        target_quant: quant tier picked by the user, or None for the default
            (Q4_K_M for models that are not prequantized).

    Returns:
        None when the model has no usable parameter count, or when it is
        prequantized at a bit-width different from *target_quant*.
        Otherwise a result dict. A model that cannot be placed at all is
        still returned, with fit_level "too_tight", run_mode "no_fit" and
        zeroed scores, so the client can show it as not fitting.
    """
    pb = params_b(model)
    if pb <= 0:
        return None

    use_case = infer_use_case(model)
    has_gpu = system.get("has_gpu", False)
    gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
    gpu_count = system.get("gpu_count", 1) or 1
    single_gpu_vram = gpu_vram / gpu_count if gpu_count > 1 else gpu_vram
    # `or 0` guards against an explicit None, which would break the numeric
    # comparisons below.
    available_ram = system.get("available_ram_gb", 0) or 0
    # When the user has explicitly picked a GPU config (not RAM mode), they want
    # to see what runs ON the GPU(s) — not big models that only "fit" by spilling
    # most layers to system RAM. Zeroing the offload budget makes _try_quant_at
    # take only its GPU branches (fit on VRAM, shrinking context if needed),
    # otherwise return None. Fixes "96 GB GPU still lists a 175 GB model".
    gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0
    eff_ram = 0 if gpu_only else available_ram
    is_moe = model.get("is_moe", False)
    ctx = model.get("context_length", 4096) or 4096

    native_quant = model.get("quantization", "Q4_K_M")
    preq = is_prequantized(model)

    # GGUF models can't be sharded across GPUs — use single GPU VRAM
    is_gguf = bool(model.get("gguf_sources"))
    quant_upper = (native_quant or "").upper()
    is_gguf_quant = quant_upper.startswith(
        ("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ", "F16", "F32")
    )
    # Single-GPU VRAM only applies to GGUF/dense builds (llama.cpp can't shard
    # across GPUs). Prequantized formats (AWQ/GPTQ/FP8) are served sharded by
    # vLLM across all GPUs, so they get the FULL multi-GPU VRAM — even when the
    # model also lists a GGUF alternate download (gguf_sources).
    if (is_gguf or is_gguf_quant) and not preq:
        effective_vram = single_gpu_vram
    else:
        effective_vram = gpu_vram

    # Determine which quant to evaluate at
    if preq:
        # AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
        # specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose
        # native bit-width matches — otherwise selecting Q8 would still surface
        # AWQ-4bit models, mixing 4- and 8-bit in one view.
        if target_quant:
            _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
            if _tb and _nb and _tb != _nb:
                return None
        quant_to_try = native_quant
    elif target_quant:
        # User picked a specific quant
        quant_to_try = target_quant
    else:
        # Default: Q4_K_M (user's stated preference)
        quant_to_try = "Q4_K_M"

    result = _try_quant_at(model, quant_to_try, ctx, effective_vram, eff_ram)

    # If target quant doesn't fit and it's not pre-quantized, try lower quants
    if result is None and not preq and target_quant:
        from services.hwfit.models import QUANT_HIERARCHY
        idx = QUANT_HIERARCHY.index(target_quant) if target_quant in QUANT_HIERARCHY else -1
        for q in QUANT_HIERARCHY[idx + 1:]:
            result = _try_quant_at(model, q, ctx, effective_vram, eff_ram)
            if result:
                break

    if result is None:
        # Model doesn't fit on the user's current hardware. Surface it
        # anyway with a "too_tight" badge instead of silently dropping
        # it — without this, editing the hardware config to try LARGER
        # tiers never revealed the bigger models, because they were
        # filtered out before the user could see what would fit. The
        # client already knows how to render too_tight (red row).
        oversized_required = estimate_memory_gb(model, quant_to_try, ctx)
        return {
            "name": model.get("name"),
            "provider": model.get("provider"),
            "parameter_count": model.get("parameter_count"),
            "params_b": round(pb, 1),
            "is_moe": is_moe,
            "use_case": use_case,
            "fit_level": "too_tight",
            "run_mode": "no_fit",
            "quant": quant_to_try,
            "context": ctx,
            "required_gb": round(oversized_required, 1),
            "speed_tps": 0,
            "score": 0,
            "scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0},
            "gguf_sources": model.get("gguf_sources", []),
            "context_length": model.get("context_length", 4096),
        }

    run_mode, quant, fit_ctx, required_gb = result

    # Determine fit level
    budget = effective_vram if run_mode == "gpu" else available_ram
    if required_gb > budget:
        return None
    if run_mode == "gpu":
        # Judge headroom against the VRAM the model was actually fitted into
        # (one card for GGUF builds), not the total across all cards.
        # Comparing with the total overstated the fit on multi-GPU hosts.
        rec = model.get("recommended_ram_gb") or required_gb
        if rec <= effective_vram:
            fit_level = "perfect"
        elif effective_vram >= required_gb * 1.2:
            fit_level = "good"
        else:
            fit_level = "marginal"
    elif run_mode == "cpu_offload":
        fit_level = "good" if available_ram >= required_gb * 1.2 else "marginal"
    else:
        fit_level = "marginal"

    tps = _estimate_speed(model, quant, run_mode, system)

    q_score = _quality_score(model, quant, use_case)
    s_score = _speed_score(tps, use_case)
    f_score = _fit_score(required_gb, budget)
    c_score = _context_score(fit_ctx, use_case)

    # Weighted blend of the four component scores for this use case.
    wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10))
    composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc

    return {
        "name": model.get("name"),
        "provider": model.get("provider"),
        "parameter_count": model.get("parameter_count"),
        "params_b": round(pb, 1),
        "is_moe": is_moe,
        "use_case": use_case,
        "fit_level": fit_level,
        "run_mode": run_mode,
        "quant": quant,
        "context": fit_ctx,
        "required_gb": round(required_gb, 1),
        "speed_tps": round(tps, 1),
        "score": round(composite, 1),
        "scores": {
            "quality": round(q_score, 1),
            "speed": round(s_score, 1),
            "fit": round(f_score, 1),
            "context": round(c_score, 1),
        },
        "gguf_sources": model.get("gguf_sources", []),
        "context_length": model.get("context_length", 4096),
    }
|
||||
|
||||
|
||||
# Sort-column name -> key function over a result dict. Built from a
# (column, result field) table; the default argument pins each field so the
# lambdas do not all share the loop variable.
SORT_KEYS = {
    column: (lambda r, _field=field: r[_field])
    for column, field in (
        ("score", "score"),
        ("speed", "speed_tps"),
        ("vram", "required_gb"),
        ("params", "params_b"),
        ("context", "context"),
    )
}
|
||||
|
||||
|
||||
def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None):
    """Rank all models against detected hardware. Returns sorted list of fit results.

    Args:
        system: hardware description dict, passed through to analyze_model.
        use_case: optional filter. "image_gen" switches to the image-model
            catalog; "general" (or None) applies no use-case filter.
        limit: maximum number of results returned.
        search: optional case-insensitive substring matched against model
            name and provider.
        sort: column name from SORT_KEYS; unknown names fall back to "score".
        quant: optional quant tier or prequantized format chosen by the user.
    """
    # Include image gen models only when explicitly filtered
    if use_case == "image_gen":
        try:
            from services.hwfit.image_models import rank_image_models
        except ImportError:
            rank_image_models = None
        img_results = rank_image_models(system, search=search) if rank_image_models else []

        # Translate the image ranker's fit labels into the text-model vocabulary.
        fit_map = {"perfect": "perfect", "good": "good", "tight": "marginal", "no_fit": "too_tight", "no_gpu": "too_tight"}
        results = [
            {
                "name": im["id"],
                "provider": im["provider"],
                "parameter_count": f"{im['params_b']}B",
                "params_b": im["params_b"],
                "is_moe": False,
                "use_case": "image_gen",
                "fit_level": fit_map.get(im["fit"], "too_tight"),
                "run_mode": "gpu" if im["fits"] else "no_fit",
                "quant": im.get("quant", "BF16"),
                "context": 0,
                "context_length": 0,
                "required_gb": round(im.get("vram_needed") or 0, 1),
                "speed_tps": 0,
                "score": float(im["score"]),
                "scores": {"quality": float(im["quality"]), "speed": float(im["speed"]), "fit": 0, "context": 0},
                "gguf_sources": [],
                "is_image_gen": True,
                "capabilities": im.get("capabilities", []),
                "description": im.get("description", ""),
            }
            for im in img_results
        ]
        sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
        results.sort(key=sort_fn, reverse=(sort != "vram"))
        return results[:limit]

    # The text-model catalog is only needed past this point.
    models = get_models()
    results = []

    # If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models
    filter_native = quant and quant.startswith(("AWQ-", "GPTQ-", "FP8"))

    # MLX-quantized models only run on Apple Silicon (Metal). Exclude them on
    # every other backend (CUDA / ROCm / CPU) so Linux/Windows users don't see
    # unrunnable suggestions.
    system_backend = (system.get("backend") or "").lower()
    apple_silicon = system_backend in ("mps", "metal", "apple")

    # Loop invariants, computed once.
    needle = search.lower() if search else None
    wanted_uc = use_case if use_case and use_case != "general" else None

    for m in models:
        # `or ""` tolerates catalog entries whose field is present but None.
        native_q = m.get("quantization") or ""

        # Drop MLX models on non-Apple hardware
        if not apple_silicon and native_q.startswith("mlx-"):
            continue

        # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
        if filter_native:
            if quant == "FP8" and native_q != "FP8":
                continue
            if quant.startswith("AWQ") and not native_q.startswith("AWQ"):
                continue
            if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
                continue

        if needle:
            name = (m.get("name") or "").lower()
            provider = (m.get("provider") or "").lower()
            if needle not in name and needle not in provider:
                continue

        # Apply the cheap use-case filter before the full analysis, so models
        # that would be dropped anyway are never analysed.
        if wanted_uc and infer_use_case(m) != wanted_uc:
            continue

        result = analyze_model(m, system, target_quant=quant)
        if result is None:
            continue

        results.append(result)

    # Pick the visible SET by best fit (score) first, so it stays the same no
    # matter which column the user sorts by — otherwise sorting by params would
    # truncate to the N biggest models (huge ones that don't even fit) while
    # sorting by vram showed the N smallest. Only AFTER choosing the set do we
    # order it by the requested column.
    results.sort(key=SORT_KEYS["score"], reverse=True)
    results = results[:limit]
    sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
    # vram ascending (smallest first), everything else descending (biggest first)
    results.sort(key=sort_fn, reverse=(sort != "vram"))
    return results
|
||||
@@ -0,0 +1,457 @@
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
# Cache lifetime in seconds (30 min) — hardware rarely changes; use the
# Rescan button to force a re-probe.
CACHE_TTL = 30 * 60

# Probe-target state shared by the helpers below.
# set by detect_system(host=...)
_remote_host = None
# set by detect_system(ssh_port=...)
_remote_port = None
# set by detect_system(platform=...): "windows", "linux", "termux"
_remote_platform = None
# set by _detect_nvidia() when nvidia-smi errors (driver mismatch, etc.)
_last_gpu_error = None
|
||||
|
||||
|
||||
def _run(cmd):
    """Run a probe command locally, or over SSH when a remote host is set.

    Args:
        cmd: argument list, or a single command string. Over SSH a list is
            shell-quoted and joined, while a string is sent verbatim so
            callers can use shell syntax. Locally the value is handed to
            subprocess.run unchanged.

    Returns:
        Stripped stdout when the command exits with status 0, otherwise
        None. Failures of any kind (missing binary, timeout, SSH error) are
        deliberately reported as None, because callers treat probes as
        best-effort.
    """
    try:
        if _remote_host:
            # Run command on remote host via SSH
            if isinstance(cmd, (list, tuple)):
                import shlex

                # Quote each argument so spaces or shell metacharacters
                # survive the remote shell. Plain arguments are unchanged.
                cmd_str = " ".join(shlex.quote(str(part)) for part in cmd)
            else:
                cmd_str = cmd
            # NOTE(review): StrictHostKeyChecking=no skips host-key
            # verification — confirm this is acceptable for the deployment.
            ssh_cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no"]
            # Normalise the port to text: an int would make subprocess raise
            # TypeError, which the handler below would swallow.
            port = str(_remote_port) if _remote_port else ""
            if port and port != "22":
                ssh_cmd += ["-p", port]
            ssh_cmd += [_remote_host, cmd_str]
            r = subprocess.run(
                ssh_cmd,
                capture_output=True, text=True, timeout=15,
            )
        else:
            r = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
        if r.returncode == 0:
            return r.stdout.strip()
    except Exception:
        # Best-effort probe: any failure means "no answer".
        pass
    return None
|
||||
|
||||
|
||||
def _group_gpus(gpus):
    """Bucket GPUs into pools of identical cards.

    Two cards belong to the same pool when they share a name and their VRAM
    rounds to the same whole number of GB. The original rationale: vLLM
    tensor-parallel only works across IDENTICAL GPUs, so a mixed box is split
    into homogeneous pools, and each pool keeps its device indices so a serve
    command can pin CUDA_VISIBLE_DEVICES to exactly one pool.

    Each pool is a dict with name, vram_each (taken from the first card
    seen), count, indices and vram_total. Pools are returned largest
    vram_total first; equal pools keep first-seen order.
    """
    pools = {}
    for gpu in gpus:
        pool = pools.setdefault(
            (gpu["name"], round(gpu["vram_gb"])),
            {
                "name": gpu["name"],
                "vram_each": round(gpu["vram_gb"], 1),
                "count": 0,
                "indices": [],
            },
        )
        pool["count"] += 1
        pool["indices"].append(gpu.get("index"))

    for pool in pools.values():
        pool["vram_total"] = round(pool["vram_each"] * pool["count"], 1)

    # Biggest pool first: the sensible default serving target.
    return sorted(pools.values(), key=lambda p: p["vram_total"], reverse=True)
|
||||
|
||||
|
||||
def _detect_nvidia():
    """Probe NVIDIA GPUs with nvidia-smi.

    Returns a dict with gpu_name (first card), gpu_vram_gb (sum over all
    cards), gpu_count, gpus, gpu_groups, homogeneous and backend "cuda".
    Returns None when nvidia-smi yields no usable rows. When the output
    looks like a driver error, _last_gpu_error is set to its first line
    (truncated to 140 characters) and None is returned.
    """
    global _last_gpu_error
    _last_gpu_error = None

    flags = ["--query-gpu=memory.total,name", "--format=csv,noheader,nounits"]
    flag_str = " ".join(flags)

    out = _run(["nvidia-smi"] + flags)

    if not out and _remote_host:
        # A non-interactive SSH shell often has a minimal PATH that omits
        # where nvidia-smi lives, so the first call silently returns nothing
        # on hosts that DO have GPUs. Retry through a login shell with the
        # common CUDA bin dirs on PATH. If that is still empty (login shell
        # isn't bash, or a profile errors), call the binary by absolute path.
        retries = [
            f"bash -lc 'export PATH=\"$PATH:/usr/bin:/usr/local/bin:/usr/local/cuda/bin\"; nvidia-smi {flag_str}'",
        ]
        retries += [
            f"{binary} {flag_str}"
            for binary in (
                "/usr/bin/nvidia-smi",
                "/usr/local/bin/nvidia-smi",
                "/usr/local/cuda/bin/nvidia-smi",
            )
        ]
        for attempt in retries:
            out = _run(attempt)
            if out:
                break

    if not out:
        return None

    # nvidia-smi present but unable to talk to the driver (e.g. it was updated
    # without a reboot) prints an error and no GPU rows — surface that as a
    # driver error rather than the misleading "No GPU".
    # NOTE(review): _run only returns stdout when the exit status is 0, so
    # this branch only fires if nvidia-smi prints the error and still exits
    # 0 — confirm it is reachable in practice.
    lowered = out.lower()
    error_markers = (
        "nvml",
        "driver/library version mismatch",
        "couldn't communicate",
        "no devices were found",
        "failed to initialize",
    )
    if any(marker in lowered for marker in error_markers):
        _last_gpu_error = out.strip().split("\n")[0][:140] or "NVIDIA driver error"
        return None

    # nvidia-smi lists GPUs in index order (0,1,2,...), so the row position is
    # the CUDA device index we'd pass to CUDA_VISIBLE_DEVICES.
    gpus = []
    for position, row in enumerate(out.strip().split("\n")):
        fields = [field.strip() for field in row.split(",")]
        if len(fields) < 2:
            continue
        try:
            memory_mb = float(fields[0])
        except ValueError:
            # Row whose memory column is not a number: skip it.
            continue
        gpus.append({"index": position, "name": fields[1], "vram_gb": memory_mb / 1024.0})

    if not gpus:
        return None

    pools = _group_gpus(gpus)
    return {
        "gpu_name": gpus[0]["name"],
        "gpu_vram_gb": round(sum(gpu["vram_gb"] for gpu in gpus), 1),
        "gpu_count": len(gpus),
        "gpus": gpus,
        "gpu_groups": pools,
        "homogeneous": len(pools) <= 1,
        "backend": "cuda",
    }
|
||||
|
||||
|
||||
def _detect_amd():
    """Probe AMD GPUs through the DRM sysfs tree.

    Handles both discrete cards (with mem_info_vram_total) and APUs /
    unified-memory SoCs like Strix Halo (which expose
    mem_info_vis_vram_total instead, or only mem_info_gtt_total).

    Returns a dict shaped like _detect_nvidia's, with backend "rocm" and an
    extra unified_memory flag. Returns None when no AMD card with a
    non-zero memory figure is found, or on any error.
    """
    drm_dir = "/sys/class/drm"

    def _sysfs(path):
        # Read one sysfs attribute, locally or over SSH; None when unreadable.
        if _remote_host:
            fetched = _run(["cat", path])
            return fetched.strip() if fetched else None
        try:
            with open(path) as handle:
                return handle.read().strip()
        except Exception:
            return None

    def _is_card(entry):
        # Keep "cardN" entries; skip connector entries such as "card0-DP-1".
        return entry.startswith("card") and "-" not in entry

    def _card_names():
        if _remote_host:
            listing = _run(["ls", drm_dir])
            return [e for e in listing.split() if _is_card(e)] if listing else []
        try:
            return [e for e in os.listdir(drm_dir) if _is_card(e)]
        except Exception:
            return []

    def _byte_count(raw):
        # Missing or non-numeric attributes count as zero bytes.
        return int(raw) if raw and raw.isdigit() else 0

    try:
        found = []
        unified = False
        # "index" is the position in the DRM listing, counting entries that
        # are skipped below.
        # NOTE(review): confirm this matches the device index the serving
        # stack expects.
        for position, entry in enumerate(_card_names()):
            device = f"/sys/class/drm/{entry}/device"
            if _sysfs(f"{device}/vendor") != "0x1002":
                continue

            # Discrete cards usually report real VRAM in mem_info_vram_total,
            # while some AMD APUs / Docker views expose a tiny vram_total and
            # the usable pool in vis_vram_total. Use the larger of those two;
            # only fall back to GTT if neither VRAM field is available.
            dedicated = _byte_count(_sysfs(f"{device}/mem_info_vram_total"))
            visible = _byte_count(_sysfs(f"{device}/mem_info_vis_vram_total"))
            gtt = _byte_count(_sysfs(f"{device}/mem_info_gtt_total"))
            usable = max(dedicated, visible) or gtt

            # NOTE(review): this also fires when the two figures are equal —
            # confirm a discrete card never reports vis == vram.
            if visible and visible >= dedicated:
                unified = True
            if usable <= 0:
                continue

            label = _sysfs(f"{device}/product_name") or f"AMD GPU ({entry})"
            found.append({"index": position, "name": label, "vram_gb": usable / (1024**3)})

        if not found:
            return None

        pools = _group_gpus(found)
        # NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
        # is the real usable GPU memory — it's physically backed but reserved
        # by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
        # RAM: the two pools are separate from the OS's perspective.
        return {
            "gpu_name": found[0]["name"],
            "gpu_vram_gb": round(sum(card["vram_gb"] for card in found), 1),
            "gpu_count": len(found),
            "gpus": found,
            "gpu_groups": pools,
            "homogeneous": len(pools) <= 1,
            "backend": "rocm",
            "unified_memory": unified,
        }
    except Exception:
        return None
|
||||
|
||||
|
||||
def _read_file(path):
    """Return the contents of *path*, read locally or via `cat` over SSH.

    Returns None when the file cannot be read. Remote content comes back
    already stripped by _run; local content is returned as-is.
    """
    if not _remote_host:
        try:
            with open(path) as handle:
                return handle.read()
        except Exception:
            return None
    return _run(["cat", path])
|
||||
|
||||
|
||||
def _parse_meminfo():
    """Parse /proc/meminfo into a dict mapping field name to its integer value.

    The first token after the colon is taken as the value (kB in
    /proc/meminfo). Lines without a colon or with a non-integer value are
    skipped. Returns an empty dict when the file cannot be read.
    """
    raw = _read_file("/proc/meminfo")
    if not raw:
        return {}

    table = {}
    for row in raw.split("\n"):
        label, colon, rest = row.partition(":")
        if not colon:
            continue
        tokens = rest.strip().split()
        if not tokens:
            continue
        try:
            table[label.strip()] = int(tokens[0])
        except ValueError:
            pass
    return table
|
||||
|
||||
|
||||
def _get_ram_gb():
    """Return total system RAM in GB, or 0.0 when it cannot be determined.

    Uses MemTotal from /proc/meminfo when present. For the local machine
    only, it otherwise falls back to os.sysconf page counts.
    """
    total_kb = _parse_meminfo().get("MemTotal")
    if total_kb is not None:
        return total_kb / (1024**2)

    if _remote_host:
        return 0.0

    try:
        page_count = os.sysconf("SC_PHYS_PAGES")
        bytes_per_page = os.sysconf("SC_PAGE_SIZE")
        if page_count and bytes_per_page:
            return (page_count * bytes_per_page) / (1024**3)
    except Exception:
        # sysconf or these names are unavailable on this platform.
        pass
    return 0.0
|
||||
|
||||
|
||||
def _get_available_ram_gb():
    """Return available RAM in GB.

    Uses MemAvailable from /proc/meminfo when present; otherwise estimates
    it as 70% of total RAM.
    """
    available_kb = _parse_meminfo().get("MemAvailable")
    if available_kb is None:
        return _get_ram_gb() * 0.7
    return available_kb / (1024**2)
|
||||
|
||||
|
||||
def _get_cpu_name():
    """Return the CPU model string, or "unknown".

    Takes the first "model name" line of /proc/cpuinfo. When there is none,
    the local machine falls back to platform.processor(); a remote host
    reports "unknown".
    """
    cpuinfo = _read_file("/proc/cpuinfo") or ""
    for row in cpuinfo.split("\n"):
        if row.startswith("model name"):
            return row.split(":", 1)[1].strip()

    if _remote_host:
        return "unknown"
    return platform.processor() or "unknown"
|
||||
|
||||
|
||||
def _get_cpu_count():
    """Return the number of logical CPUs.

    For a remote host, `nproc` is asked first; if that fails, the
    "processor" lines of /proc/cpuinfo are counted. In every other case the
    result is os.cpu_count(), with 1 as the floor.
    """
    if not _remote_host:
        return os.cpu_count() or 1

    reported = _run(["nproc"])
    if reported:
        try:
            return int(reported.strip())
        except ValueError:
            pass

    # fallback: count "processor" lines in /proc/cpuinfo
    cpuinfo = _read_file("/proc/cpuinfo")
    if cpuinfo:
        return sum(1 for row in cpuinfo.split("\n") if row.startswith("processor"))

    return os.cpu_count() or 1
|
||||
|
||||
|
||||
def _detect_windows():
    """Detect Windows hardware in a single SSH call using PowerShell.

    One PowerShell script gathers RAM, CPU and GPU details and prints them as
    compact JSON. The script is sent with -EncodedCommand (base64 of UTF-16LE
    text), because it contains double quotes, `$` and `|` that cannot be
    embedded safely inside a `-Command "..."` argument.

    Returns a dict with total_ram_gb, available_ram_gb, cpu_cores, cpu_name,
    has_gpu, gpu_name, gpu_vram_gb, gpu_count and backend, plus gpus,
    gpu_groups and homogeneous when a GPU is reported. Returns None when the
    command yields nothing or its output cannot be parsed.
    """
    import base64
    import json as _json

    # Single PowerShell command that gathers all hardware info at once
    ps_cmd = (
        "$r = @{}; "
        "$os = Get-CimInstance Win32_OperatingSystem; "
        "$r.ram_gb = [math]::Round($os.TotalVisibleMemorySize / 1048576, 1); "
        "$r.avail_gb = [math]::Round($os.FreePhysicalMemory / 1048576, 1); "
        "$cpu = Get-CimInstance Win32_Processor | Select-Object -First 1; "
        "$r.cpu_name = $cpu.Name; "
        "$r.cpu_cores = (Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum; "
        "$r.arch = $cpu.AddressWidth; "
        # GPU detection via nvidia-smi (fastest) or WMI fallback
        "try { "
        " $nv = nvidia-smi --query-gpu=memory.total,name --format=csv,noheader,nounits 2>$null; "
        " if ($LASTEXITCODE -eq 0 -and $nv) { "
        " $gpus = @(); "
        " foreach ($line in $nv -split \"`n\") { "
        " $p = $line -split ','; "
        " if ($p.Count -ge 2) { $gpus += @{name=$p[1].Trim(); vram_mb=[double]$p[0].Trim()} } "
        " }; "
        " $r.gpu_name = $gpus[0].name; "
        " $r.gpu_vram_gb = [math]::Round(($gpus | Measure-Object -Property vram_mb -Sum).Sum / 1024, 1); "
        " $r.gpu_count = $gpus.Count; "
        " $r.gpu_backend = 'cuda'; "
        " } "
        "} catch {}; "
        # NOTE(review): AdapterRAM is presumably a 32-bit field and may
        # under-report cards with more than 4 GB — confirm.
        "if (-not $r.gpu_name) { "
        " $wmiGpu = Get-CimInstance Win32_VideoController | Where-Object { $_.AdapterRAM -gt 0 } | Select-Object -First 1; "
        " if ($wmiGpu) { "
        " $r.gpu_name = $wmiGpu.Name; "
        " $r.gpu_vram_gb = [math]::Round($wmiGpu.AdapterRAM / 1073741824, 1); "
        " $r.gpu_count = 1; "
        " $r.gpu_backend = 'cpu_x86'; "  # WMI doesn't tell us CUDA/ROCm
        " } "
        "}; "
        "$r | ConvertTo-Json -Compress"
    )

    # The base64 alphabet needs no quoting in cmd.exe, PowerShell or a POSIX
    # shell, so the script arrives intact whatever the remote login shell is.
    encoded = base64.b64encode(ps_cmd.encode("utf-16-le")).decode("ascii")
    out = _run(f"powershell -NoProfile -NonInteractive -EncodedCommand {encoded}")
    if not out:
        return None

    try:
        d = _json.loads(out)
        result = {
            "total_ram_gb": d.get("ram_gb", 0),
            "available_ram_gb": d.get("avail_gb", 0),
            "cpu_cores": d.get("cpu_cores", 1),
            "cpu_name": d.get("cpu_name", "unknown"),
            "has_gpu": bool(d.get("gpu_name")),
            "gpu_name": d.get("gpu_name"),
            "gpu_vram_gb": d.get("gpu_vram_gb"),
            "gpu_count": d.get("gpu_count", 0),
            "backend": d.get("gpu_backend", "cpu_x86"),
        }
        # PowerShell only reports aggregate GPU info, not per-card detail, so we
        # can't tell a mixed box from a uniform one here — assume one homogeneous
        # pool spanning all reported GPUs (the common Windows case).
        _n = result["gpu_count"] or 0
        if result["has_gpu"] and _n > 0:
            _each = round((result["gpu_vram_gb"] or 0) / _n, 1)
            result["gpus"] = [
                {"index": i, "name": result["gpu_name"], "vram_gb": _each} for i in range(_n)
            ]
            result["gpu_groups"] = [{
                "name": result["gpu_name"],
                "vram_each": _each,
                "count": _n,
                "indices": list(range(_n)),
                "vram_total": result["gpu_vram_gb"],
            }]
            result["homogeneous"] = True
        return result
    except Exception:
        # Malformed or unexpected JSON: report "no result", not a crash.
        return None
|
||||
|
||||
|
||||
# Probe results cached per target, keyed by the host string ("_local" for
# the machine this code runs on). Entries expire after CACHE_TTL seconds.
_cache_by_host = {}  # host -> (timestamp, result)
|
||||
|
||||
|
||||
def detect_system(host="", ssh_port="", platform="", fresh=False):
    """Detect system hardware: RAM, CPU, GPU.

    Results are cached per host (hardware rarely changes, and probing a remote
    host over SSH is slow). Error results are cached as well, so a failed
    connection is not retried until the entry expires or ``fresh`` is set.

    Args:
        host: Remote target (e.g. ``'user@server'``). When set, detection
            commands run over SSH; when empty, the local machine is probed.
        ssh_port: SSH port for the remote target; empty means unset.
        platform: ``"windows"``, ``"linux"``, ``"termux"``, or ``""``
            (auto-detect).
        fresh: Pass True to bypass the cache and re-probe (the "Rescan"
            button).

    Returns:
        A dict describing the hardware (``total_ram_gb``, ``cpu_cores``,
        ``has_gpu``, ``backend``, ...), or ``{"error": ..., "host": ...}``
        when a remote host could not be probed.
    """
    global _remote_host, _remote_port, _remote_platform

    cache_key = host or "_local"
    now = time.time()
    if not fresh and cache_key in _cache_by_host:
        ts, cached = _cache_by_host[cache_key]
        if (now - ts) < CACHE_TTL:
            return cached

    # The probe helpers read the target from these module globals.
    _remote_host = host or None
    _remote_port = ssh_port or None
    _remote_platform = platform or None
    try:
        result = _probe_hardware(host)
    finally:
        # Always clear the remote target, even when a probe raises, so a
        # failed remote scan cannot leave later calls pointed at that host.
        # NOTE(review): _remote_port is intentionally left as-is, matching the
        # previous behavior — confirm it is ignored when no host is set.
        _remote_host = None
        _remote_platform = None

    _cache_by_host[cache_key] = (now, result)
    return result


def _probe_hardware(host):
    """Probe the target selected by the module-level remote settings.

    ``host`` is only used to build the error payload. Returns either a
    hardware dict or an ``{"error", "host"}`` dict; never touches the cache.
    """
    # Windows: single PowerShell command for all hardware info.
    if _remote_platform == "windows" and _remote_host:
        # A falsy result means the PowerShell probe failed.
        return _detect_windows() or {"error": f"Cannot connect to {host}", "host": host}

    # Linux/Termux: existing multi-command detection.
    total_ram = round(_get_ram_gb(), 1)
    # If a remote host reports 0 RAM, the connection likely failed.
    if _remote_host and total_ram <= 0:
        return {"error": f"Cannot connect to {host}", "host": host}

    available_ram = round(_get_available_ram_gb(), 1)
    cpu_cores = _get_cpu_count()
    cpu_name = _get_cpu_name()

    gpu_info = _detect_nvidia() or _detect_amd()
    if gpu_info:
        return {
            "total_ram_gb": total_ram,
            "available_ram_gb": available_ram,
            "cpu_cores": cpu_cores,
            "cpu_name": cpu_name,
            "has_gpu": True,
            "gpu_name": gpu_info["gpu_name"],
            "gpu_vram_gb": gpu_info["gpu_vram_gb"],
            "gpu_count": gpu_info["gpu_count"],
            "gpus": gpu_info.get("gpus", []),
            "gpu_groups": gpu_info.get("gpu_groups", []),
            "homogeneous": gpu_info.get("homogeneous", True),
            "backend": gpu_info["backend"],
        }

    # No GPU found: pick the CPU backend from the machine architecture.
    if _remote_host:
        arch_out = _run(["uname", "-m"]) or ""
    else:
        import platform as _platform
        arch_out = _platform.machine().lower()
    backend = "cpu_arm" if "aarch64" in arch_out or "arm" in arch_out else "cpu_x86"
    return {
        "total_ram_gb": total_ram,
        "available_ram_gb": available_ram,
        "cpu_cores": cpu_cores,
        "cpu_name": cpu_name,
        "has_gpu": False,
        "gpu_name": None,
        "gpu_vram_gb": None,
        "gpu_count": 0,
        "backend": backend,
        # Set when nvidia-smi exists but failed (e.g. driver/library
        # version mismatch) — lets the UI say "GPU driver error" instead
        # of the misleading "No GPU".
        "gpu_error": _last_gpu_error,
    }
|
||||
@@ -0,0 +1,374 @@
|
||||
"""Image generation model registry and VRAM fitting for Cookbook."""
|
||||
|
||||
# Curated registry of image generation models supported by diffusers.
# ONLY verified HuggingFace repo IDs.
# VRAM estimates are for inference (single image generation).
#
# Entry fields, as consumed by rank_image_models() in this module:
#   id / name / description  - matched case-insensitively against the search text.
#   vram_bf16 / vram_fp8 / vram_q4 - VRAM estimate per precision, compared
#       against the system's "gpu_vram_gb"; None means that precision is not
#       considered for the model.
#   default_quant - precision reported when no precision fits the GPU.
#   quant_repos   - precision label -> alternative repo ID for that precision.
#   quality / speed - numeric scores combined into the ranking score.
#   released      - "YYYY-MM" string, passed through to the result.
# NOTE(review): rank_image_models() only looks up the "BF16", "FP8" and "Q4"
# labels, so other quant_repos keys (e.g. "NF4") are never selected by it.
IMAGE_MODEL_REGISTRY = [
    # ── Z-Image (Alibaba Tongyi) ──
    {
        "id": "Tongyi-MAI/Z-Image-Turbo",
        "name": "Z-Image Turbo",
        "provider": "Tongyi",
        "params_b": 6.0,
        "vram_bf16": 19.0,
        "vram_fp8": 10.0,
        "vram_q4": 6.0,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "drbaph/Z-Image-Turbo-FP8",
        },
        "capabilities": ["text-to-image"],
        "description": "6B distilled, 8-step. Sub-second on H800. Apache 2.0.",
        "quality": 92,
        "speed": 95,
        "released": "2025-12",
    },
    {
        "id": "Tongyi-MAI/Z-Image",
        "name": "Z-Image",
        "provider": "Tongyi",
        "params_b": 6.0,
        "vram_bf16": 19.0,
        "vram_fp8": 10.0,
        "vram_q4": 6.0,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "drbaph/Z-Image-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "Full undistilled model. Highest creative freedom. Apache 2.0.",
        "quality": 93,
        "speed": 70,
        "released": "2025-12",
    },
    # ── Qwen Image ──
    {
        "id": "Qwen/Qwen-Image-2512",
        "name": "Qwen Image 2512",
        "provider": "Qwen",
        "params_b": 20.0,
        "vram_bf16": 42.0,
        "vram_fp8": 22.0,
        "vram_q4": 14.0,
        "default_quant": "FP8",
        "quant_repos": {},
        "capabilities": ["text-to-image", "text-rendering"],
        "description": "Dec 2025 update. Better humans, finer detail, strong text. Apache 2.0.",
        "quality": 95,
        "speed": 50,
        "released": "2025-12",
    },
    {
        "id": "Qwen/Qwen-Image",
        "name": "Qwen Image",
        "provider": "Qwen",
        "params_b": 20.0,
        "vram_bf16": 42.0,
        "vram_fp8": 22.0,
        "vram_q4": 14.0,
        "default_quant": "FP8",
        "quant_repos": {},
        "capabilities": ["text-to-image", "text-rendering"],
        "description": "20B foundation. Best text rendering in images. Apache 2.0.",
        "quality": 94,
        "speed": 50,
        "released": "2025-08",
    },
    {
        "id": "Qwen/Qwen-Image-Edit-2511",
        "name": "Qwen Image Edit",
        "provider": "Qwen",
        "params_b": 20.0,
        "vram_bf16": 42.0,
        "vram_fp8": 22.0,
        "vram_q4": 14.0,
        "default_quant": "FP8",
        "quant_repos": {},
        "capabilities": ["image-editing", "inpainting"],
        "description": "Dedicated editing. Style transfer, object removal. Apache 2.0.",
        "quality": 92,
        "speed": 50,
        "released": "2025-11",
    },
    # ── Stable Diffusion (dedicated inpainting) ──
    {
        "id": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
        "name": "SDXL Inpainting",
        "provider": "Stability AI",
        "params_b": 3.5,
        "vram_bf16": 12.0,
        "vram_fp8": 8.0,
        "vram_q4": 6.0,
        "default_quant": "BF16",
        "quant_repos": {},
        "capabilities": ["inpainting", "image-editing"],
        "description": "SDXL fine-tuned for inpainting (9-channel UNet). Best SD-family fill quality; fits a 24GB card comfortably.",
        "quality": 86,
        "speed": 68,
        "released": "2023-11",
    },
    {
        "id": "stable-diffusion-v1-5/stable-diffusion-inpainting",
        "name": "SD 1.5 Inpainting",
        "provider": "Stability AI",
        "params_b": 1.1,
        "vram_bf16": 4.0,
        "vram_fp8": 3.0,
        "vram_q4": 2.5,
        "default_quant": "BF16",
        "quant_repos": {},
        "capabilities": ["inpainting"],
        "description": "Classic SD 1.5 inpaint. Very light and fast; lower fidelity than SDXL.",
        "quality": 70,
        "speed": 92,
        "released": "2022-10",
    },
    # ── FLUX ──
    {
        "id": "black-forest-labs/FLUX.1-dev",
        "name": "FLUX.1 Dev",
        "provider": "Black Forest Labs",
        "params_b": 12.0,
        "vram_bf16": 33.0,
        "vram_fp8": 17.0,
        "vram_q4": 10.0,
        "default_quant": "FP8",
        "quant_repos": {
            "FP8": "diffusers/FLUX.1-dev-torchao-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "High quality, detailed. Popular community model. Non-commercial.",
        "quality": 92,
        "speed": 55,
        "released": "2024-08",
    },
    {
        "id": "black-forest-labs/FLUX.1-schnell",
        "name": "FLUX.1 Schnell",
        "provider": "Black Forest Labs",
        "params_b": 12.0,
        "vram_bf16": 33.0,
        "vram_fp8": 17.0,
        "vram_q4": 10.0,
        "default_quant": "FP8",
        "quant_repos": {
            "FP8": "Kijai/flux-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "Fast 4-step variant. Apache 2.0 license.",
        "quality": 85,
        "speed": 90,
        "released": "2024-08",
    },
    # ── Stable Diffusion ──
    {
        "id": "stabilityai/stable-diffusion-3.5-medium",
        "name": "SD 3.5 Medium",
        "provider": "Stability AI",
        "params_b": 2.5,
        "vram_bf16": 12.0,
        "vram_fp8": 7.0,
        "vram_q4": None,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "2.5B lightweight, fast. Fits almost any GPU.",
        "quality": 75,
        "speed": 95,
        "released": "2024-10",
    },
    {
        "id": "stabilityai/stable-diffusion-3.5-large",
        "name": "SD 3.5 Large",
        "provider": "Stability AI",
        "params_b": 8.1,
        "vram_bf16": 22.0,
        "vram_fp8": 12.0,
        "vram_q4": None,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "8B high quality. Good balance of speed and quality.",
        "quality": 85,
        "speed": 70,
        "released": "2024-10",
    },
    {
        "id": "stabilityai/stable-diffusion-3.5-large-turbo",
        "name": "SD 3.5 Large Turbo",
        "provider": "Stability AI",
        "params_b": 8.1,
        "vram_bf16": 22.0,
        "vram_fp8": 12.0,
        "vram_q4": None,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "Distilled for few-step inference. Fastest large SD.",
        "quality": 80,
        "speed": 92,
        "released": "2024-10",
    },
    {
        "id": "stabilityai/stable-diffusion-xl-base-1.0",
        "name": "SDXL",
        "provider": "Stability AI",
        "params_b": 3.5,
        "vram_bf16": 10.0,
        "vram_fp8": 6.0,
        "vram_q4": None,
        "default_quant": "BF16",
        "quant_repos": {},
        "capabilities": ["text-to-image"],
        "description": "Classic workhorse. Huge LoRA ecosystem. Fits 8GB+.",
        "quality": 72,
        "speed": 90,
        "released": "2023-07",
    },
    # ── Hunyuan ──
    {
        "id": "tencent/HunyuanImage-3.0",
        "name": "HunyuanImage 3.0",
        "provider": "Tencent",
        "params_b": 13.0,
        "vram_bf16": 30.0,
        "vram_fp8": 16.0,
        "vram_q4": 9.0,
        "default_quant": "FP8",
        "quant_repos": {
            "Q4": "wikeeyang/Hunyuan-Image-30-Qint4",
            "NF4": "EricRollei/HunyuanImage-3.0-Instruct-NF4",
        },
        "capabilities": ["text-to-image", "text-rendering"],
        "description": "Strong text rendering. Bilingual Chinese/English. 13B activated per token.",
        "quality": 88,
        "speed": 60,
        "released": "2025-09",
    },
    {
        "id": "tencent/HunyuanImage-3.0-Instruct-Distil",
        "name": "HunyuanImage 3.0 Distil",
        "provider": "Tencent",
        "params_b": 13.0,
        "vram_bf16": 30.0,
        "vram_fp8": 16.0,
        "vram_q4": 9.0,
        "default_quant": "FP8",
        "quant_repos": {},
        "capabilities": ["text-to-image", "text-rendering"],
        "description": "Distilled variant, fewer steps. Faster with comparable quality.",
        "quality": 85,
        "speed": 80,
        "released": "2026-01",
    },
]
|
||||
|
||||
|
||||
def get_image_models():
    """Expose the curated image-model registry.

    The module-level list itself is handed back (not a copy).
    """
    registry = IMAGE_MODEL_REGISTRY
    return registry
|
||||
|
||||
|
||||
def rank_image_models(system, search=None, sort="fit", models=None):
    """Score and rank image models against detected hardware.

    Args:
        system: Hardware dict; ``"gpu_vram_gb"`` and ``"has_gpu"`` are read.
        search: Optional case-insensitive substring matched against each
            model's name, id and description. Falsy means no filtering.
        sort: ``"quality"``, ``"speed"``, ``"vram"``, or anything else for
            the default ordering by descending score.
        models: Optional iterable of registry-style entries to rank instead
            of ``IMAGE_MODEL_REGISTRY``.

    Returns:
        A list of dicts, one per matching model, with fit info (VRAM needed,
        whether it fits, chosen precision and its repo) plus the score.
    """
    registry = IMAGE_MODEL_REGISTRY if models is None else models
    gpu_vram = system.get("gpu_vram_gb", 0) or 0
    has_gpu = system.get("has_gpu", False)
    needle = search.lower() if search else None

    # Precisions in preference order (highest fidelity first) with the
    # registry key that holds each one's VRAM estimate.
    quant_keys = (("BF16", "vram_bf16"), ("FP8", "vram_fp8"), ("Q4", "vram_q4"))
    vram_key_for = dict(quant_keys)

    results = []
    for model in registry:
        if needle:
            haystacks = (model["name"], model["id"], model.get("description", ""))
            if not any(needle in text.lower() for text in haystacks):
                continue

        # Determine the best precision that fits.
        quant = None
        vram_needed = None
        fits = False
        quant_repo = None

        if has_gpu and gpu_vram > 0:
            for q, vram_key in quant_keys:
                v = model.get(vram_key)
                if v is not None and v <= gpu_vram * 0.90:  # 10% headroom
                    quant = q
                    vram_needed = v
                    fits = True
                    quant_repo = model.get("quant_repos", {}).get(q)
                    break

        # If nothing fits, show what the default precision needs. Report the
        # VRAM figure for that same precision so "quant" and "vram_needed"
        # agree; fall back to the BF16 figure when it has no estimate.
        if not fits:
            quant = model["default_quant"]
            vram_needed = model.get(vram_key_for.get(quant, "vram_bf16"))
            if vram_needed is None:
                vram_needed = model.get("vram_bf16", 0)

        # Fit label.
        if not has_gpu:
            fit = "no_gpu"
            fit_label = "No GPU"
        elif fits:
            headroom = gpu_vram - vram_needed
            if headroom > gpu_vram * 0.3:
                fit = "perfect"
                fit_label = "Perfect"
            elif headroom > gpu_vram * 0.1:
                fit = "good"
                fit_label = "Good"
            else:
                # NOTE(review): a model only "fits" at <= 90% of VRAM, so
                # headroom is always >= 10%; this branch is reached only at
                # the exact 10% boundary.
                fit = "tight"
                fit_label = "Tight"
        else:
            fit = "no_fit"
            fit_label = "Too large"

        # Score: weighted sum of quality and speed, plus a fit bonus/penalty.
        score = model["quality"] * 0.6 + model["speed"] * 0.2
        if fit == "perfect":
            score += 20
        elif fit == "good":
            score += 10
        elif fit == "tight":
            score += 5
        elif fit == "no_fit":
            score -= 30

        results.append({
            "id": model["id"],
            "name": model["name"],
            "provider": model["provider"],
            "params_b": model["params_b"],
            "vram_needed": vram_needed,
            "quant": quant,
            "quant_repo": quant_repo,
            "fits": fits,
            "fit": fit,
            "fit_label": fit_label,
            "quality": model["quality"],
            "speed": model["speed"],
            "score": round(score, 1),
            "capabilities": model["capabilities"],
            "description": model["description"],
            "released": model.get("released", ""),
        })

    # Sort
    if sort == "quality":
        results.sort(key=lambda x: (-x["quality"], -x["score"]))
    elif sort == "speed":
        results.sort(key=lambda x: (-x["speed"], -x["score"]))
    elif sort == "vram":
        # Entries with no (or zero) VRAM figure sort last.
        results.sort(key=lambda x: (x["vram_needed"] or 999, -x["score"]))
    else:  # fit (default)
        results.sort(key=lambda x: (-x["score"],))

    return results
|
||||
@@ -0,0 +1,177 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
# GGUF quantization levels in the order best_quant_for_budget() tries them:
# best quality first, then progressively smaller fallbacks.
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]

# Per-quantization multiplier used by estimate_memory_gb(): parameters (in
# billions) times this value gives the weight footprint in GB. Quantizations
# missing from the table fall back to 0.58 there.
QUANT_BPP = {
    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
    "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
    "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
    "AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
    "GPTQ-Int4": 0.50, "GPTQ-Int8": 1.0,
    "mlx-4bit": 0.55, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
}

# Per-quantization speed multiplier. Not used in this module.
# NOTE(review): presumably 1.0 is the Q5_K_M baseline and larger means
# faster — confirm against the consumer.
QUANT_SPEED_MULT = {
    "F16": 0.6, "BF16": 0.6, "FP8": 0.85,
    "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
    "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
    "AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
    "GPTQ-Int4": 1.2, "GPTQ-Int8": 0.85,
    "mlx-4bit": 1.15, "mlx-8bit": 0.85, "mlx-6bit": 1.0,
}

# Per-quantization quality adjustment (zero or negative). Not used in this
# module. NOTE(review): presumably added to a quality score — confirm.
QUANT_QUALITY_PENALTY = {
    "F16": 0.0, "BF16": 0.0, "FP8": 0.0,
    "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
    "Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
    "AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
    "GPTQ-Int4": -3.0, "GPTQ-Int8": 0.0,
    "mlx-4bit": -4.0, "mlx-8bit": 0.0, "mlx-6bit": -1.0,
}

# Nominal bytes per parameter for each quantization. These differ from
# QUANT_BPP for several GGUF levels (e.g. Q4_K_M is 0.5 here, 0.58 there).
# Not used in this module.
QUANT_BYTES_PER_PARAM = {
    "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
    "Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
    "Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
    "AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
    "GPTQ-Int4": 0.5, "GPTQ-Int8": 1.0,
    "mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
}
|
||||
|
||||
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8")


def is_prequantized(model):
    """Return True when the model's quantization is a pre-quantized format.

    A model counts as pre-quantized when its ``"quantization"`` value starts
    with one of ``PREQUANTIZED_PREFIXES``. A missing, empty or ``None`` value
    is treated as not pre-quantized.
    """
    # `or ""` covers a key that is present but null (e.g. JSON null).
    q = model.get("quantization") or ""
    return q.startswith(PREQUANTIZED_PREFIXES)
|
||||
|
||||
|
||||
def params_b(model):
    """Return the model's parameter count in billions.

    ``"parameters_raw"`` (an absolute count) wins when it is positive.
    Otherwise ``"parameter_count"`` is parsed as a number with an optional
    unit suffix: ``K``, ``M``, ``B``/``G`` or ``T``. Returns ``0.0`` when
    neither field yields a usable number.
    """
    raw = model.get("parameters_raw")
    if raw and raw > 0:
        return raw / 1_000_000_000.0

    pc = model.get("parameter_count", "")
    if not pc:
        return 0.0

    # Require a well-formed decimal so inputs such as "." or "1.2.3" are
    # rejected here instead of blowing up in float().
    m = re.match(r"^(\d+(?:\.\d*)?|\.\d+)\s*([BKMGT]?)$", str(pc).strip().upper())
    if not m:
        return 0.0

    val = float(m.group(1))
    suffix = m.group(2)
    if suffix in ("B", "G"):
        # "G" (giga) is the same magnitude as "B" (billion).
        return val
    if suffix == "M":
        return val / 1000.0
    if suffix == "K":
        return val / 1_000_000.0
    if suffix == "T":
        return val * 1000.0

    # No unit. A bare number is conventionally a millions count
    # (e.g. "355" = 355M), NOT billions — otherwise a 355M model would sort
    # as 355B and leap above every 7B/70B model. A genuine billions figure
    # carries a "B" suffix and is handled above; very large bare values are
    # raw parameter counts.
    if val >= 1_000_000:
        return val / 1_000_000_000.0  # raw count
    return val / 1000.0  # e.g. "355" → 0.355B
|
||||
|
||||
|
||||
def estimate_memory_gb(model, quant, ctx):
    """Estimate the VRAM needed to serve ``model`` at ``quant`` and ``ctx``.

    The estimate is the sum of three terms:
      * weights: total parameters (billions) times the QUANT_BPP multiplier
        for ``quant`` (0.58 when the quant is unknown) — every weight is
        counted, including all experts of an MoE model;
      * KV cache: 0.000008 times the active parameters (billions) times the
        context length;
      * a constant 0.5.
    """
    weights_gb = params_b(model) * QUANT_BPP.get(quant, 0.58)
    kv_cache_gb = 0.000008 * _active_params_b(model) * ctx
    return weights_gb + kv_cache_gb + 0.5
|
||||
|
||||
|
||||
def _active_params_b(model):
|
||||
"""For MoE: active params per token (affects KV cache and speed, not total VRAM).
|
||||
For dense: same as total params."""
|
||||
if model.get("is_moe") and model.get("active_parameters"):
|
||||
return model["active_parameters"] / 1_000_000_000.0
|
||||
return params_b(model)
|
||||
|
||||
|
||||
def best_quant_for_budget(model, budget_gb, ctx):
    """Pick the best quantization and context length that fit ``budget_gb``.

    Pre-quantized models are only tried at their native quantization; all
    others walk ``QUANT_HIERARCHY`` from best quality down. The requested
    context is tried first; if nothing fits, the context is halved
    repeatedly (never below 1024) and the candidates are retried.

    Returns ``(quant, ctx, mem_gb)`` for the first combination that fits,
    or ``(None, None, None)`` when none does.
    """
    if is_prequantized(model):
        candidates = [model.get("quantization", "Q4_K_M")]
    else:
        candidates = QUANT_HIERARCHY

    window = ctx
    while True:
        for candidate in candidates:
            needed = estimate_memory_gb(model, candidate, window)
            if needed <= budget_gb:
                return candidate, window, needed
        # Nothing fits at this context size: halve it and retry, stopping
        # once the window would drop below 1024.
        window //= 2
        if window < 1024:
            return None, None, None
|
||||
|
||||
|
||||
def infer_use_case(model):
    """Classify a model into a use-case label from its name and use_case text.

    The lower-cased ``"name"`` and ``"use_case"`` fields are searched for
    keywords; the first matching rule (in the order listed below) wins.
    Missing or ``None`` fields are treated as empty. Returns one of
    ``"embedding"``, ``"tts"``, ``"stt"``, ``"coding"``, ``"multimodal"``,
    ``"reasoning"``, ``"chat"``, or ``"general"`` when nothing matches.
    """
    # `or ""` guards against fields that are present but null.
    name = (model.get("name") or "").lower()
    uc = (model.get("use_case") or "").lower()
    combined = name + " " + uc

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("embedding", ("embedding", "embed", "bge")),
        ("tts", ("tts", "text-to-speech", "speech-synthesis", "cosyvoice", "parler")),
        ("stt", ("stt", "speech-to-text", "whisper", "transcri", "asr")),
        ("coding", ("code",)),
        ("multimodal", ("vision", "multimodal", "vlm", "vl-")),
        ("reasoning", ("reason", "chain-of-thought", "deepseek-r1")),
        ("chat", ("chat", "instruction")),
    )
    for label, keywords in rules:
        if any(k in combined for k in keywords):
            return label
    return "general"
|
||||
|
||||
|
||||
_models_cache = None


def get_models():
    """Return the model catalog loaded from ``data/hf_models.json``.

    The file is read once and the parsed result is kept in a module-level
    cache; later calls return the same object. A missing or unparseable
    catalog yields an empty list (which is cached too).
    """
    global _models_cache
    if _models_cache is None:
        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            with open(model_catalog_path(), encoding="utf-8") as f:
                _models_cache = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
            _models_cache = []
    return _models_cache


def model_catalog_path():
    """Return the path of the bundled catalog, ``data/hf_models.json`` next to this module."""
    return os.path.join(os.path.dirname(__file__), "data", "hf_models.json")
|
||||
Reference in New Issue
Block a user