Odysseus v1.0

This commit is contained in:
pewdiepie-archdaemon
2026-05-31 23:58:26 +09:00
commit e5c99a5eee
421 changed files with 271349 additions and 0 deletions
View File
File diff suppressed because it is too large Load Diff
+463
View File
@@ -0,0 +1,463 @@
import re
from services.hwfit.models import (
params_b, estimate_memory_gb, infer_use_case,
get_models, is_prequantized, _active_params_b, QUANT_BYTES_PER_PARAM,
QUANT_SPEED_MULT, QUANT_QUALITY_PENALTY,
)
# Memory bandwidth per GPU model, keyed by a lowercase fragment of the device
# name (matched as a substring by _lookup_bandwidth). _estimate_speed divides
# the value by a model size in GB, so the unit is presumably GB/s — TODO
# confirm against vendor spec sheets.
GPU_BANDWIDTH = {
"5090": 1792, "5080": 960, "5070 ti": 896, "5070": 672, "5060 ti": 448, "5060": 256,
"4090": 1008, "4080 super": 736, "4080": 717, "4070 ti super": 672, "4070 ti": 504, "4070 super": 504, "4070": 504, "4060 ti": 288, "4060": 272,
"3090 ti": 1008, "3090": 936, "3080 ti": 912, "3080": 760, "3070 ti": 608, "3070": 448, "3060 ti": 448, "3060": 360,
"2080 ti": 616, "2080 super": 496, "2080": 448, "2070 super": 448, "2070": 448, "2060 super": 448, "2060": 336,
"1660 ti": 288, "1660 super": 336, "1660": 192, "1650 super": 192, "1650": 128,
"h100 sxm": 3350, "h100": 2039, "h200": 4800, "a100 sxm": 2039, "a100": 1555,
"l40s": 864, "l40": 864, "l4": 300, "a10g": 600, "a10": 600, "t4": 320,
"v100 sxm": 900, "v100": 897, "a6000": 768, "a5000": 768, "a4000": 448,
"7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
"9070 xt": 624, "9070": 488,
}
# Pre-sort keys by length descending for correct substring matching
# (so "4070 ti super" is tried before "4070 ti", and "l40s" before "l4").
_BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True)
# Per-backend constant k for the fallback speed estimate used when no bandwidth
# figure applies: _estimate_speed returns k / params_b * quant speed multiplier.
FALLBACK_K = {"cuda": 220, "rocm": 180, "cpu_x86": 70, "cpu_arm": 90}
# Composite-score weights per use case, in the order
# (quality, speed, fit, context) — the order analyze_model unpacks them in.
# Every tuple sums to 1.0.
USE_CASE_WEIGHTS = {
"general": (0.45, 0.30, 0.15, 0.10),
"coding": (0.50, 0.20, 0.15, 0.15),
"reasoning": (0.55, 0.15, 0.15, 0.15),
"chat": (0.40, 0.35, 0.15, 0.10),
"multimodal": (0.50, 0.20, 0.15, 0.15),
"embedding": (0.30, 0.40, 0.20, 0.10),
"tts": (0.40, 0.35, 0.15, 0.10),
"stt": (0.40, 0.35, 0.15, 0.10),
}
# Tokens-per-second figure that earns a full speed score (see _speed_score).
SPEED_TARGET = {
"general": 40, "coding": 40, "multimodal": 40, "chat": 40,
"reasoning": 25, "embedding": 200, "tts": 40, "stt": 40,
}
# Context length that earns a full context score (see _context_score).
CONTEXT_TARGET = {
"general": 4096, "chat": 4096, "coding": 8192,
"reasoning": 8192, "multimodal": 4096, "embedding": 512,
"tts": 2048, "stt": 2048,
}
def _lookup_bandwidth(gpu_name):
    """Return the GPU_BANDWIDTH entry whose key occurs in *gpu_name*.

    Matching is a case-insensitive substring test. Keys are tried longest
    first, so the most specific model name wins. Returns None for an empty
    name or when no key matches.
    """
    if not gpu_name:
        return None
    lowered = gpu_name.lower()
    hit = next((key for key in _BW_KEYS_SORTED if key in lowered), None)
    return None if hit is None else GPU_BANDWIDTH[hit]
def _estimate_speed(model, quant, run_mode, system):
    """Estimate tokens per second for *model* at *quant* in *run_mode*.

    The parameter count comes from _active_params_b(model), so for MoE models
    only the active parameters are counted.

    Two estimators are used:
    * Bandwidth-based, when the GPU name resolves to a known bandwidth and the
      model runs on the GPU ("gpu") or partly on it ("cpu_offload"):
      bandwidth / weight size * 0.55, scaled by 0.5 for cpu_offload, 0.8 for
      MoE models, and 1.0 otherwise.
    * Fallback, in every other case: FALLBACK_K[backend] / params * the
      per-quant speed multiplier.

    Returns 0.0 when the parameter count (or resulting weight size) is not
    positive.
    """
    active_b = _active_params_b(model)
    bandwidth = _lookup_bandwidth(system.get("gpu_name"))

    if not bandwidth or run_mode not in ("gpu", "cpu_offload"):
        per_billion = FALLBACK_K.get(system.get("backend", "cpu_x86"), 70)
        if active_b <= 0:
            return 0.0
        return per_billion / active_b * QUANT_SPEED_MULT.get(quant, 1.0)

    weights_gb = active_b * QUANT_BYTES_PER_PARAM.get(quant, 0.5)
    if weights_gb <= 0:
        return 0.0

    if run_mode == "cpu_offload":
        penalty = 0.5
    elif model.get("is_moe", False):
        penalty = 0.8
    else:
        penalty = 1.0
    return (bandwidth / weights_gb) * 0.55 * penalty
def _quality_score(model, quant, use_case):
    """Score expected output quality on a 0-100 scale.

    The score starts from a size tier based on params_b(model). It then adds
    model-family bonuses (cumulative: a name containing several family tags
    collects each bonus), the per-quant adjustment from
    QUANT_QUALITY_PENALTY, and a specialisation bonus when the model's
    inferred use case equals the requested *use_case*. The result is clamped
    to [0, 100].
    """
    size_b = params_b(model)

    # (exclusive upper bound in billions of params, base score)
    for ceiling, tier_score in ((1, 30), (3, 45), (7, 60), (10, 75), (20, 82), (40, 89)):
        if size_b < ceiling:
            score = tier_score
            break
    else:
        score = 95

    lowered = model.get("name", "").lower()
    family_bonuses = (
        (("qwen",), 2),
        (("deepseek",), 3),
        (("llama",), 2),
        (("mistral", "mixtral"), 1),
        (("gemma",), 1),
    )
    for tags, bonus in family_bonuses:
        if any(tag in lowered for tag in tags):
            score += bonus

    score += QUANT_QUALITY_PENALTY.get(quant, 0)

    detected = infer_use_case(model)
    if detected == use_case:
        if detected in ("coding", "multimodal"):
            score += 6
        elif detected == "reasoning" and size_b >= 13:
            # Reasoning models only get the bonus from 13B upwards.
            score += 5

    return max(0, min(100, score))
def _speed_score(tps, use_case):
    """Convert tokens/second into a 0-100 score relative to the use case's target.

    Reaching SPEED_TARGET[use_case] (40 when the use case is unknown) yields
    100; slower speeds scale linearly.
    """
    goal = SPEED_TARGET.get(use_case, 40)
    percent_of_goal = (tps / goal) * 100
    return max(0, min(100, percent_of_goal))
def _fit_score(required, available):
    """Score how well *required* memory fits into *available* memory (0-100).

    Returns 0 when nothing is available or the requirement exceeds it. Using
    up to half of the budget ramps linearly from 60 to 100, 50-80% usage
    scores 100, 80-90% scores 70, and anything tighter scores 50.
    """
    if available <= 0 or required > available:
        return 0

    used = required / available
    if used <= 0.5:
        return 60 + (used / 0.5) * 40
    # (inclusive upper bound of the usage ratio, score)
    for ceiling, score in ((0.8, 100), (0.9, 70)):
        if used <= ceiling:
            return score
    return 50
def _context_score(ctx, use_case):
    """Score a usable context length against the use case's target.

    Returns 100 at or above CONTEXT_TARGET[use_case] (4096 for unknown use
    cases), 70 at or above half the target, and 30 otherwise.
    """
    wanted = CONTEXT_TARGET.get(use_case, 4096)
    if ctx >= wanted:
        return 100
    return 70 if ctx >= wanted / 2 else 30
def _try_quant_at(model, quant, ctx, gpu_vram, available_ram):
    """Find a way to run *model* at *quant*, shrinking the context if needed.

    The full context is tried first, then successive halvings for as long as
    the halved context is at least 1024 tokens. At each context length the
    model is placed on the GPU if it fits in *gpu_vram*, otherwise in system
    RAM ("cpu_offload" when a GPU exists, "cpu_only" when it does not).

    Returns (run_mode, quant, ctx, mem) for the first placement that fits, or
    None when none does.
    """
    has_vram = gpu_vram > 0
    ram_mode = "cpu_offload" if has_vram else "cpu_only"

    # The requested context is always attempted, even when below 1024.
    candidates = [ctx]
    shrunk = ctx // 2
    while shrunk >= 1024:
        candidates.append(shrunk)
        shrunk //= 2

    for candidate in candidates:
        needed = estimate_memory_gb(model, quant, candidate)
        if has_vram and needed <= gpu_vram:
            return "gpu", quant, candidate, needed
        if needed <= available_ram:
            return ram_mode, quant, candidate, needed
    return None
# Prequantized labels, after separators are stripped: AWQ4 / GPTQ8 / INT8 / W4A16 ...
_PREQUANT_BITS_RE = re.compile(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})")
# Generic "<n>BIT" spelling (4BIT, 8BIT ...)
_N_BIT_RE = re.compile(r"(\d{1,2})BIT")


def _quant_bits(q):
    """Approximate bit-width of a quant label.

    Lets GGUF quant tiers (Q4/Q8/...) be matched against prequantized formats
    (AWQ 4, AWQ-8bit, FP8, GPTQ-4bit, MXFP4 ...). The label is upper-cased and
    stripped of "-", "_" and spaces before matching.

    Float labels are recognised in both spellings (F16/FP16, F32/FP32, BF16)
    and all map to 16, the "unquantized" tier. FP4-family labels map to 4.

    Returns 0 when the width is unknown (callers treat 0 as "don't filter").
    """
    label = (q or "").upper()
    for separator in "-_ ":
        label = label.replace(separator, "")

    # GGUF k-quants + float formats
    if label.startswith("Q8") or "FP8" in label:
        return 8
    if label.startswith(("Q4", "IQ4")) or "FP4" in label:
        return 4
    if label.startswith(("Q2", "IQ2")):
        return 2
    if label.startswith(("Q3", "IQ3")):
        return 3
    if label.startswith("Q5"):
        return 5
    if label.startswith("Q6"):
        return 6
    if label.startswith(("F16", "BF16", "F32", "FP16", "FP32")):
        return 16

    # Prequantized formats: pull the bit-width digits out of the label.
    match = _PREQUANT_BITS_RE.search(label) or _N_BIT_RE.search(label)
    if match:
        bits = int(match.group(1))
        if 2 <= bits <= 16:
            return bits
    return 0
def _model_row(model, pb, is_moe, use_case, details):
    """Assemble one result row: identity fields, *details*, then download info."""
    row = {
        "name": model.get("name"),
        "provider": model.get("provider"),
        "parameter_count": model.get("parameter_count"),
        "params_b": round(pb, 1),
        "is_moe": is_moe,
        "use_case": use_case,
    }
    row.update(details)
    row["gguf_sources"] = model.get("gguf_sources", [])
    row["context_length"] = model.get("context_length", 4096)
    return row


def analyze_model(model, system, target_quant=None):
    """Evaluate how one model fits and performs on the described hardware.

    Args:
        model: model registry entry (dict).
        system: hardware description as produced by detect_system; reads
            has_gpu, gpu_vram_gb, gpu_count, available_ram_gb, gpu_only and
            (through _estimate_speed) gpu_name / backend.
        target_quant: quant tier picked by the user, or None for the default
            Q4_K_M.

    Returns:
        A result row (dict), or None when the model has no usable parameter
        count or is a prequantized model whose bit-width does not match
        *target_quant*. Models that do not fit are still returned, with
        fit_level "too_tight" and run_mode "no_fit".
    """
    pb = params_b(model)
    if pb <= 0:
        return None
    use_case = infer_use_case(model)
    has_gpu = system.get("has_gpu", False)
    gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
    gpu_count = system.get("gpu_count", 1) or 1
    single_gpu_vram = gpu_vram / gpu_count if gpu_count > 1 else gpu_vram
    # `or 0` also covers an explicit None, which would otherwise break the
    # memory comparisons below.
    available_ram = system.get("available_ram_gb") or 0
    # When the user has explicitly picked a GPU config (not RAM mode), they want
    # to see what runs ON the GPU(s) — not big models that only "fit" by spilling
    # most layers to system RAM. Zeroing the offload budget makes _try_quant_at
    # take only its GPU branches (fit on VRAM, shrinking context if needed),
    # otherwise return None. Fixes "96 GB GPU still lists a 175 GB model".
    gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0
    eff_ram = 0 if gpu_only else available_ram
    is_moe = model.get("is_moe", False)
    ctx = model.get("context_length", 4096) or 4096
    native_quant = model.get("quantization", "Q4_K_M")
    preq = is_prequantized(model)
    is_gguf = bool(model.get("gguf_sources"))
    quant_upper = (native_quant or "").upper()
    is_gguf_quant = quant_upper.startswith(
        ("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ", "F16", "F32")
    )
    # Single-GPU VRAM only applies to GGUF/dense builds (llama.cpp can't shard
    # across GPUs). Prequantized formats (AWQ/GPTQ/FP8) are served sharded by
    # vLLM across all GPUs, so they get the FULL multi-GPU VRAM — even when the
    # model also lists a GGUF alternate download (gguf_sources).
    if (is_gguf or is_gguf_quant) and not preq:
        effective_vram = single_gpu_vram
    else:
        effective_vram = gpu_vram

    # Determine which quant to evaluate at
    if preq:
        # AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
        # specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose
        # native bit-width matches — otherwise selecting Q8 would still surface
        # AWQ-4bit models, mixing 4- and 8-bit in one view.
        if target_quant:
            target_bits = _quant_bits(target_quant)
            native_bits = _quant_bits(native_quant)
            if target_bits and native_bits and target_bits != native_bits:
                return None
        quant_to_try = native_quant
    elif target_quant:
        quant_to_try = target_quant
    else:
        # Default: Q4_K_M (user's stated preference)
        quant_to_try = "Q4_K_M"

    result = _try_quant_at(model, quant_to_try, ctx, effective_vram, eff_ram)

    # If target quant doesn't fit and it's not pre-quantized, try lower quants
    if result is None and not preq and target_quant:
        from services.hwfit.models import QUANT_HIERARCHY
        idx = QUANT_HIERARCHY.index(target_quant) if target_quant in QUANT_HIERARCHY else -1
        for q in QUANT_HIERARCHY[idx + 1:]:
            result = _try_quant_at(model, q, ctx, effective_vram, eff_ram)
            if result:
                break

    if result is None:
        # Model doesn't fit on the user's current hardware. Surface it
        # anyway with a "too_tight" badge instead of silently dropping
        # it — without this, editing the hardware config to try LARGER
        # tiers never revealed the bigger models, because they were
        # filtered out before the user could see what would fit. The
        # client already knows how to render too_tight (red row).
        oversized_required = estimate_memory_gb(model, quant_to_try, ctx)
        return _model_row(model, pb, is_moe, use_case, {
            "fit_level": "too_tight",
            "run_mode": "no_fit",
            "quant": quant_to_try,
            "context": ctx,
            "required_gb": round(oversized_required, 1),
            "speed_tps": 0,
            "score": 0,
            "scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0},
        })

    run_mode, quant, fit_ctx, required_gb = result

    budget = effective_vram if run_mode == "gpu" else available_ram
    if required_gb > budget:
        return None

    # Determine fit level. GPU headroom is judged against the VRAM the model
    # can actually use (effective_vram): on a multi-GPU box a GGUF model is
    # confined to one card, so comparing against the summed VRAM of all cards
    # would overstate the headroom.
    if run_mode == "gpu":
        rec = model.get("recommended_ram_gb") or required_gb
        if rec <= effective_vram:
            fit_level = "perfect"
        elif effective_vram >= required_gb * 1.2:
            fit_level = "good"
        else:
            fit_level = "marginal"
    elif run_mode == "cpu_offload":
        fit_level = "good" if available_ram >= required_gb * 1.2 else "marginal"
    else:
        fit_level = "marginal"

    tps = _estimate_speed(model, quant, run_mode, system)
    q_score = _quality_score(model, quant, use_case)
    s_score = _speed_score(tps, use_case)
    f_score = _fit_score(required_gb, budget)
    c_score = _context_score(fit_ctx, use_case)
    wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10))
    composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc

    return _model_row(model, pb, is_moe, use_case, {
        "fit_level": fit_level,
        "run_mode": run_mode,
        "quant": quant,
        "context": fit_ctx,
        "required_gb": round(required_gb, 1),
        "speed_tps": round(tps, 1),
        "score": round(composite, 1),
        "scores": {
            "quality": round(q_score, 1),
            "speed": round(s_score, 1),
            "fit": round(f_score, 1),
            "context": round(c_score, 1),
        },
    })
def _row_field(field):
    """Build a sort-key function that reads *field* from a result row."""
    def _key(row):
        return row[field]
    return _key


# Sort column name (as requested by the caller) -> key function over a result row.
SORT_KEYS = {
    "score": _row_field("score"),
    "speed": _row_field("speed_tps"),
    "vram": _row_field("required_gb"),
    "params": _row_field("params_b"),
    "context": _row_field("context"),
}
def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None):
    """Rank all models against detected hardware. Returns sorted list of fit results.

    Args:
        system: hardware description passed on to analyze_model.
        use_case: optional filter. "image_gen" switches to the image-model
            ranking; "general" (or None) keeps every language model; any other
            value keeps only models whose inferred use case equals it.
        limit: maximum number of rows returned.
        search: optional case-insensitive substring matched against the model
            name and provider.
        sort: one of the SORT_KEYS columns; unknown values fall back to "score".
        quant: optional quant tier or prequantized format (AWQ-*/GPTQ-*/FP8).
    """
    sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
    # vram ascending (smallest first), everything else descending (biggest first)
    descending = sort != "vram"

    # Include image gen models only when explicitly filtered
    if use_case == "image_gen":
        try:
            from services.hwfit.image_models import rank_image_models
        except ImportError:
            rank_image_models = None
        img_results = rank_image_models(system, search=search) if rank_image_models else []
        fit_map = {
            "perfect": "perfect",
            "good": "good",
            "tight": "marginal",
            "no_fit": "too_tight",
            "no_gpu": "too_tight",
        }
        results = [
            {
                "name": im["id"],
                "provider": im["provider"],
                "parameter_count": f"{im['params_b']}B",
                "params_b": im["params_b"],
                "is_moe": False,
                "use_case": "image_gen",
                "fit_level": fit_map.get(im["fit"], "too_tight"),
                "run_mode": "gpu" if im["fits"] else "no_fit",
                "quant": im.get("quant", "BF16"),
                "context": 0,
                "context_length": 0,
                "required_gb": round(im.get("vram_needed") or 0, 1),
                "speed_tps": 0,
                "score": float(im["score"]),
                "scores": {
                    "quality": float(im["quality"]),
                    "speed": float(im["speed"]),
                    "fit": 0,
                    "context": 0,
                },
                "gguf_sources": [],
                "is_image_gen": True,
                "capabilities": im.get("capabilities", []),
                "description": im.get("description", ""),
            }
            for im in img_results
        ]
        results.sort(key=sort_fn, reverse=descending)
        return results[:limit]

    models = get_models()

    # If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models
    filter_native = bool(quant) and quant.startswith(("AWQ-", "GPTQ-", "FP8"))

    # MLX-quantized models only run on Apple Silicon (Metal). Exclude them on
    # every other backend (CUDA / ROCm / CPU) so Linux/Windows users don't see
    # unrunnable suggestions.
    system_backend = (system.get("backend") or "").lower()
    apple_silicon = system_backend in ("mps", "metal", "apple")

    needle = search.lower() if search else None
    # "general" means "show everything", so it never filters.
    wanted_uc = use_case if use_case and use_case != "general" else None

    results = []
    for m in models:
        # A registry entry may carry quantization=None; treat it like "absent".
        native_q = m.get("quantization") or ""

        # Drop MLX models on non-Apple hardware
        if not apple_silicon and native_q.startswith("mlx-"):
            continue

        # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
        if filter_native:
            if quant == "FP8" and native_q != "FP8":
                continue
            if quant.startswith("AWQ") and not native_q.startswith("AWQ"):
                continue
            if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
                continue

        if needle is not None:
            name = (m.get("name") or "").lower()
            provider = (m.get("provider") or "").lower()
            if needle not in name and needle not in provider:
                continue

        # Cheap filter first: skip the full analysis for models that would be
        # discarded anyway.
        if wanted_uc is not None and infer_use_case(m) != wanted_uc:
            continue

        result = analyze_model(m, system, target_quant=quant)
        if result is not None:
            results.append(result)

    # Pick the visible SET by best fit (score) first, so it stays the same no
    # matter which column the user sorts by — otherwise sorting by params would
    # truncate to the N biggest models (huge ones that don't even fit) while
    # sorting by vram showed the N smallest. Only AFTER choosing the set do we
    # order it by the requested column.
    results.sort(key=SORT_KEYS["score"], reverse=True)
    results = results[:limit]
    results.sort(key=sort_fn, reverse=descending)
    return results
+457
View File
@@ -0,0 +1,457 @@
import os
import platform
import subprocess
import time
# Module-level probe state. detect_system() assigns the _remote_* values before
# probing and the helper functions below read them to decide between local
# execution and SSH.
# NOTE(review): because the target lives in module globals, two detect_system()
# calls running concurrently would presumably interfere — confirm callers
# serialise access.
CACHE_TTL = 1800 # 30 min — hardware rarely changes; use the Rescan button to force a re-probe
_remote_host = None # set by detect_system(host=...)
_remote_port = None # set by detect_system(ssh_port=...)
_remote_platform = None # set by detect_system(platform=...): "windows", "linux", "termux"
_last_gpu_error = None # set by _detect_nvidia() when nvidia-smi errors (driver mismatch, etc.)
def _run_capture(cmd):
    """Run *cmd* locally, or over SSH when a remote host is configured.

    *cmd* is an argument list or a command string. For SSH a list is joined
    with spaces into one remote command line.

    Returns the subprocess.CompletedProcess (whatever its exit status), or
    None when the command could not be launched or timed out. This is a
    best-effort probe, so failures are deliberately not raised.
    """
    try:
        if _remote_host:
            # A host that looks like an option ("-oProxyCommand=...") would be
            # interpreted by ssh as a flag rather than a destination.
            if str(_remote_host).startswith("-"):
                return None
            cmd_str = " ".join(cmd) if isinstance(cmd, list) else cmd
            # NOTE(review): StrictHostKeyChecking=no disables host-key
            # verification — acceptable only on trusted networks.
            ssh_cmd = ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no"]
            # str() so an integer port works for both the comparison and argv.
            if _remote_port and str(_remote_port) != "22":
                ssh_cmd += ["-p", str(_remote_port)]
            ssh_cmd += [_remote_host, cmd_str]
            return subprocess.run(ssh_cmd, capture_output=True, text=True, timeout=15)
        return subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    except Exception:
        return None


def _run(cmd):
    """Run *cmd* (see _run_capture); return stripped stdout, or None on any failure.

    A non-zero exit status counts as failure.
    """
    proc = _run_capture(cmd)
    if proc is not None and proc.returncode == 0:
        return proc.stdout.strip()
    return None


def _group_gpus(gpus):
    """Group identical GPUs by (name, rounded VRAM).

    vLLM tensor-parallel only works across IDENTICAL GPUs, so a mixed box must
    be split into homogeneous pools. Each group carries the device indices so a
    serve command can pin CUDA_VISIBLE_DEVICES to exactly one pool. Biggest pool
    (by total VRAM) first — that's the sensible auto-default serving target.

    Each group is a dict with name, vram_each (VRAM of the first GPU seen in
    the group, one decimal), count, indices and vram_total.
    """
    pools = {}  # insertion-ordered: groups appear in first-seen order
    for gpu in gpus:
        key = (gpu["name"], round(gpu["vram_gb"]))
        pool = pools.get(key)
        if pool is None:
            pool = pools[key] = {
                "name": gpu["name"],
                "vram_each": round(gpu["vram_gb"], 1),
                "count": 0,
                "indices": [],
            }
        pool["count"] += 1
        pool["indices"].append(gpu.get("index"))

    ordered = list(pools.values())
    for pool in ordered:
        pool["vram_total"] = round(pool["vram_each"] * pool["count"], 1)
    ordered.sort(key=lambda p: p["vram_total"], reverse=True)
    return ordered


# Lower-case fragments of the messages nvidia-smi prints when it cannot talk to
# the driver.
_NVIDIA_ERROR_MARKERS = (
    "nvml",
    "driver/library version mismatch",
    "couldn't communicate",
    "no devices were found",
    "failed to initialize",
)


def _nvidia_driver_error(text):
    """Return a short error message if *text* looks like an nvidia-smi failure.

    The message is the first line of *text*, capped at 140 characters.
    Returns None when no known error marker is present.
    """
    if not text:
        return None
    lowered = text.lower()
    if not any(marker in lowered for marker in _NVIDIA_ERROR_MARKERS):
        return None
    return text.strip().split("\n")[0][:140] or "NVIDIA driver error"


def _parse_nvidia_rows(out):
    """Parse `memory.total,name` CSV rows into GPU dicts.

    nvidia-smi lists GPUs in index order (0,1,2,...), so the row position is
    the CUDA device index we'd pass to CUDA_VISIBLE_DEVICES. Rows that do not
    have two fields or a numeric memory value are skipped.
    """
    gpus = []
    for idx, line in enumerate(out.strip().split("\n")):
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 2:
            continue
        try:
            vram_mb = float(parts[0])
        except ValueError:
            continue
        gpus.append({"index": idx, "name": parts[1], "vram_gb": vram_mb / 1024.0})
    return gpus


def _detect_nvidia():
    """Detect NVIDIA GPUs through nvidia-smi.

    Returns a dict describing the GPUs (name of the first GPU, total VRAM,
    count, per-GPU list, homogeneous groups, backend "cuda"), or None when no
    GPU could be read. When nvidia-smi reports a driver problem the message is
    stored in the module-level _last_gpu_error and None is returned.
    """
    global _last_gpu_error
    _last_gpu_error = None

    query = "--query-gpu=memory.total,name --format=csv,noheader,nounits"
    attempts = [["nvidia-smi", "--query-gpu=memory.total,name", "--format=csv,noheader,nounits"]]
    if _remote_host:
        # Remote fallback: a non-interactive SSH shell often has a minimal PATH
        # that omits where nvidia-smi lives (/usr/bin, /usr/local/cuda/bin), so
        # the first call silently returns nothing → "No GPU" on hosts that DO
        # have GPUs. Retry through a login shell with the common CUDA bin dirs
        # on PATH.
        attempts.append(
            "bash -lc 'export PATH=\"$PATH:/usr/bin:/usr/local/bin:/usr/local/cuda/bin\"; "
            "nvidia-smi --query-gpu=memory.total,name --format=csv,noheader,nounits'"
        )
        # Last resort: call nvidia-smi by absolute path. Some hosts have a login
        # shell that isn't bash (or a profile that errors), so the bash -lc
        # retry still comes back empty even though the binary is right there.
        attempts.extend(
            f"{path} {query}"
            for path in ("/usr/bin/nvidia-smi", "/usr/local/bin/nvidia-smi", "/usr/local/cuda/bin/nvidia-smi")
        )

    out = None
    for cmd in attempts:
        proc = _run_capture(cmd)
        if proc is None:
            continue
        if proc.returncode == 0:
            text = proc.stdout.strip()
            if text:
                out = text
                break
            continue
        # nvidia-smi exits non-zero when it cannot reach the driver, so the
        # diagnostic has to be read from the failed run: going through _run()
        # would discard it and the problem would be reported as "No GPU".
        error = _nvidia_driver_error(proc.stdout) or _nvidia_driver_error(proc.stderr)
        if error:
            _last_gpu_error = error
            return None
    if not out:
        return None

    # nvidia-smi present but unable to talk to the driver (e.g. it was updated
    # without a reboot). It prints an error and no GPU rows — surface that as a
    # driver error rather than the misleading "No GPU".
    error = _nvidia_driver_error(out)
    if error:
        _last_gpu_error = error
        return None

    gpus = _parse_nvidia_rows(out)
    if not gpus:
        return None
    total_vram = sum(g["vram_gb"] for g in gpus)
    groups = _group_gpus(gpus)
    return {
        "gpu_name": gpus[0]["name"],
        "gpu_vram_gb": round(total_vram, 1),
        "gpu_count": len(gpus),
        "gpus": gpus,
        "gpu_groups": groups,
        "homogeneous": len(groups) <= 1,
        "backend": "cuda",
    }
def _detect_amd():
    """Detect AMD GPUs. Handles both discrete cards (with mem_info_vram_total)
    and APUs / unified-memory SoCs like Strix Halo (which expose
    mem_info_vis_vram_total instead, or only mem_info_gtt_total).

    Cards are read from /sys/class/drm (locally or over SSH) and visited in
    card-number order so that indices and the reported gpu_name are stable.

    Returns a dict describing the GPUs (backend "rocm", plus a
    "unified_memory" flag), or None when no AMD GPU with a memory size is
    found or anything goes wrong while probing.
    """

    def _read(path):
        """Return the stripped content of *path*, or None if unreadable."""
        if _remote_host:
            val = _run(["cat", path])
            return val.strip() if val else None
        try:
            with open(path) as f:
                return f.read().strip()
        except Exception:
            return None

    def _read_int(path):
        """Return the file's content as an int, or 0 when missing/non-numeric."""
        raw = _read(path)
        return int(raw) if raw and raw.isdigit() else 0

    def _list_drm_cards():
        """Return the cardN entries (connector entries such as card0-DP-1 excluded)."""
        if _remote_host:
            out = _run(["ls", "/sys/class/drm"])
            entries = out.split() if out else []
        else:
            try:
                entries = os.listdir("/sys/class/drm")
            except Exception:
                return []
        cards = [e for e in entries if e.startswith("card") and "-" not in e]
        # os.listdir returns entries in arbitrary order; sort by card number
        # (length first, so card2 precedes card10) to keep results deterministic.
        return sorted(cards, key=lambda e: (len(e), e))

    try:
        cards = []
        is_apu = False
        # NOTE(review): the index counts every DRM card, including non-AMD
        # ones that are skipped below — confirm this is the index consumers
        # expect.
        for _cidx, entry in enumerate(_list_drm_cards()):
            base = f"/sys/class/drm/{entry}/device"
            vendor = _read(f"{base}/vendor")
            if vendor != "0x1002":
                continue
            # Discrete cards usually report real VRAM in mem_info_vram_total,
            # while some AMD APUs / Docker views expose a tiny vram_total and
            # the usable pool in vis_vram_total. Use the larger of those two;
            # only fall back to GTT if neither VRAM field is available.
            vram_val = _read_int(f"{base}/mem_info_vram_total")
            vis_val = _read_int(f"{base}/mem_info_vis_vram_total")
            gtt_val = _read_int(f"{base}/mem_info_gtt_total")
            vram_bytes = max(vram_val, vis_val)
            if vram_bytes <= 0:
                vram_bytes = gtt_val
            if vis_val and vis_val >= vram_val:
                is_apu = True
            if vram_bytes <= 0:
                continue
            name = _read(f"{base}/product_name") or f"AMD GPU ({entry})"
            cards.append({"index": _cidx, "name": name, "vram_gb": vram_bytes / (1024**3)})
        if not cards:
            return None
        total_vram = sum(c["vram_gb"] for c in cards)
        groups = _group_gpus(cards)
        # NOTE: for APUs with BIOS UMA carveout (e.g. Strix Halo), vis_vram_total
        # is the real usable GPU memory — it's physically backed but reserved
        # by BIOS so it doesn't appear in /proc/meminfo. Don't cap it at system
        # RAM: the two pools are separate from the OS's perspective.
        return {
            "gpu_name": cards[0]["name"],
            "gpu_vram_gb": round(total_vram, 1),
            "gpu_count": len(cards),
            "gpus": cards,
            "gpu_groups": groups,
            "homogeneous": len(groups) <= 1,
            "backend": "rocm",
            "unified_memory": is_apu,
        }
    except Exception:
        return None
def _read_file(path):
    """Return the text of *path* from the local disk or, if a remote host is
    configured, from that host via `cat` over SSH. Returns None on failure."""
    if not _remote_host:
        try:
            with open(path) as handle:
                return handle.read()
        except Exception:
            return None
    return _run(["cat", path])
def _parse_meminfo():
    """Read /proc/meminfo and return {field name: integer value}.

    Only the first token after the colon is kept. Lines without a colon,
    without a value, or with a non-integer value are ignored. Returns an empty
    dict when the file cannot be read.
    """
    raw = _read_file("/proc/meminfo")
    if not raw:
        return {}

    table = {}
    for row in raw.split("\n"):
        label, colon, rest = row.partition(":")
        if not colon:
            continue
        tokens = rest.strip().split()
        if not tokens:
            continue
        try:
            table[label.strip()] = int(tokens[0])
        except ValueError:
            continue
    return table
def _get_ram_gb():
    """Return total RAM in GB.

    Uses MemTotal from /proc/meminfo when available. For the local machine
    only, it falls back to os.sysconf page counts. Returns 0.0 when neither
    source works.
    """
    total_kb = _parse_meminfo().get("MemTotal")
    if total_kb is not None:
        return total_kb / (1024**2)

    if _remote_host:
        return 0.0
    try:
        page_count = os.sysconf("SC_PHYS_PAGES")
        bytes_per_page = os.sysconf("SC_PAGE_SIZE")
        if page_count and bytes_per_page:
            return (page_count * bytes_per_page) / (1024**3)
    except Exception:
        pass
    return 0.0
def _get_available_ram_gb():
    """Return available RAM in GB: MemAvailable from /proc/meminfo, or 70% of
    total RAM when that field is missing."""
    available_kb = _parse_meminfo().get("MemAvailable")
    if available_kb is None:
        return _get_ram_gb() * 0.7
    return available_kb / (1024**2)
def _get_cpu_name():
    """Return the CPU model name.

    Takes the first "model name" line of /proc/cpuinfo. Without one, the
    local machine falls back to platform.processor(), and a remote host
    yields "unknown".
    """
    cpuinfo = _read_file("/proc/cpuinfo") or ""
    model_line = next(
        (row for row in cpuinfo.split("\n") if row.startswith("model name")),
        None,
    )
    if model_line is not None:
        return model_line.split(":", 1)[1].strip()
    if _remote_host:
        return "unknown"
    return platform.processor() or "unknown"
def _get_cpu_count():
    """Return the number of logical CPUs.

    Locally this is os.cpu_count(). For a remote host it is the output of
    `nproc`, or failing that the number of "processor" lines in
    /proc/cpuinfo. If both remote probes fail the local count is returned.
    """
    if not _remote_host:
        return os.cpu_count() or 1

    reported = _run(["nproc"])
    if reported:
        try:
            return int(reported.strip())
        except ValueError:
            pass

    # fallback: count "processor" lines in /proc/cpuinfo
    cpuinfo = _read_file("/proc/cpuinfo")
    if cpuinfo:
        return len([row for row in cpuinfo.split("\n") if row.startswith("processor")])
    return os.cpu_count() or 1
def _encode_powershell(script):
    """Encode *script* for `powershell -EncodedCommand` (Base64 of UTF-16LE)."""
    import base64
    return base64.b64encode(script.encode("utf-16-le")).decode("ascii")


def _windows_result(d):
    """Turn the decoded PowerShell hardware report *d* into a system dict."""
    result = {
        "total_ram_gb": d.get("ram_gb", 0),
        "available_ram_gb": d.get("avail_gb", 0),
        "cpu_cores": d.get("cpu_cores", 1),
        "cpu_name": d.get("cpu_name", "unknown"),
        "has_gpu": bool(d.get("gpu_name")),
        "gpu_name": d.get("gpu_name"),
        "gpu_vram_gb": d.get("gpu_vram_gb"),
        "gpu_count": d.get("gpu_count", 0),
        "backend": d.get("gpu_backend", "cpu_x86"),
    }
    # PowerShell only reports aggregate GPU info, not per-card detail, so we
    # can't tell a mixed box from a uniform one here — assume one homogeneous
    # pool spanning all reported GPUs (the common Windows case).
    gpu_total = result["gpu_count"] or 0
    if result["has_gpu"] and gpu_total > 0:
        vram_each = round((result["gpu_vram_gb"] or 0) / gpu_total, 1)
        result["gpus"] = [
            {"index": i, "name": result["gpu_name"], "vram_gb": vram_each}
            for i in range(gpu_total)
        ]
        result["gpu_groups"] = [{
            "name": result["gpu_name"],
            "vram_each": vram_each,
            "count": gpu_total,
            "indices": list(range(gpu_total)),
            "vram_total": result["gpu_vram_gb"],
        }]
        result["homogeneous"] = True
    return result


def _detect_windows():
    """Detect Windows hardware in a single SSH call using PowerShell.

    Returns the system dict built from the script's JSON output, or None when
    the command fails or its output cannot be decoded.
    """
    # Single PowerShell command that gathers all hardware info at once
    ps_cmd = (
        "$r = @{}; "
        "$os = Get-CimInstance Win32_OperatingSystem; "
        "$r.ram_gb = [math]::Round($os.TotalVisibleMemorySize / 1048576, 1); "
        "$r.avail_gb = [math]::Round($os.FreePhysicalMemory / 1048576, 1); "
        "$cpu = Get-CimInstance Win32_Processor | Select-Object -First 1; "
        "$r.cpu_name = $cpu.Name; "
        "$r.cpu_cores = (Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum; "
        "$r.arch = $cpu.AddressWidth; "
        # GPU detection via nvidia-smi (fastest) or WMI fallback
        "try { "
        " $nv = nvidia-smi --query-gpu=memory.total,name --format=csv,noheader,nounits 2>$null; "
        " if ($LASTEXITCODE -eq 0 -and $nv) { "
        " $gpus = @(); "
        " foreach ($line in $nv -split \"`n\") { "
        " $p = $line -split ','; "
        " if ($p.Count -ge 2) { $gpus += @{name=$p[1].Trim(); vram_mb=[double]$p[0].Trim()} } "
        " }; "
        " $r.gpu_name = $gpus[0].name; "
        " $r.gpu_vram_gb = [math]::Round(($gpus | Measure-Object -Property vram_mb -Sum).Sum / 1024, 1); "
        " $r.gpu_count = $gpus.Count; "
        " $r.gpu_backend = 'cuda'; "
        " } "
        "} catch {}; "
        "if (-not $r.gpu_name) { "
        " $wmiGpu = Get-CimInstance Win32_VideoController | Where-Object { $_.AdapterRAM -gt 0 } | Select-Object -First 1; "
        " if ($wmiGpu) { "
        " $r.gpu_name = $wmiGpu.Name; "
        " $r.gpu_vram_gb = [math]::Round($wmiGpu.AdapterRAM / 1073741824, 1); "
        " $r.gpu_count = 1; "
        " $r.gpu_backend = 'cpu_x86'; "  # WMI doesn't tell us CUDA/ROCm
        " } "
        "}; "
        "$r | ConvertTo-Json -Compress"
    )
    # The script contains double quotes and `$` sigils, so embedding it in
    # `-Command "..."` lets the remote shell re-interpret it. -EncodedCommand
    # carries it as Base64 and delivers it to PowerShell unchanged.
    out = _run(f"powershell -EncodedCommand {_encode_powershell(ps_cmd)}")
    if not out:
        return None
    import json as _json
    try:
        return _windows_result(_json.loads(out))
    except Exception:
        return None
_cache_by_host = {}  # host -> (timestamp, result)


def _probe_system(host):
    """Probe the target selected by the module-level _remote_* settings.

    Returns the system dict, or {"error": ..., "host": host} when a remote
    host cannot be reached.
    """
    unreachable = {"error": f"Cannot connect to {host}", "host": host}

    # Windows: single PowerShell command for all hardware info
    if _remote_platform == "windows" and _remote_host:
        return _detect_windows() or unreachable

    # Linux/Termux: existing multi-command detection
    total_ram = round(_get_ram_gb(), 1)
    # If remote host returns 0 RAM, connection likely failed
    if _remote_host and total_ram <= 0:
        return unreachable

    result = {
        "total_ram_gb": total_ram,
        "available_ram_gb": round(_get_available_ram_gb(), 1),
        "cpu_cores": _get_cpu_count(),
        "cpu_name": _get_cpu_name(),
    }

    gpu_info = _detect_nvidia() or _detect_amd()
    if gpu_info:
        result.update({
            "has_gpu": True,
            "gpu_name": gpu_info["gpu_name"],
            "gpu_vram_gb": gpu_info["gpu_vram_gb"],
            "gpu_count": gpu_info["gpu_count"],
            "gpus": gpu_info.get("gpus", []),
            "gpu_groups": gpu_info.get("gpu_groups", []),
            "homogeneous": gpu_info.get("homogeneous", True),
            "backend": gpu_info["backend"],
            # Only the AMD probe reports this; keep it instead of dropping it.
            "unified_memory": gpu_info.get("unified_memory", False),
        })
        return result

    if _remote_host:
        arch_out = _run(["uname", "-m"]) or ""
    else:
        arch_out = platform.machine().lower()
    backend = "cpu_arm" if "aarch64" in arch_out or "arm" in arch_out else "cpu_x86"
    result.update({
        "has_gpu": False,
        "gpu_name": None,
        "gpu_vram_gb": None,
        "gpu_count": 0,
        "backend": backend,
        # Set when nvidia-smi exists but failed (e.g. driver/library
        # version mismatch) — lets the UI say "GPU driver error" instead
        # of the misleading "No GPU".
        "gpu_error": _last_gpu_error,
    })
    return result


def detect_system(host="", ssh_port="", platform="", fresh=False):
    """Detect system hardware: RAM, CPU, GPU. Cached per host (hardware rarely
    changes, and probing a remote host over SSH is slow). Pass fresh=True to
    bypass the cache and re-probe (the "Rescan" button).

    If host is set (e.g. 'user@server'), runs detection commands over SSH.
    platform: "windows", "linux", "termux", or "" (auto-detect).

    Returns the system dict, or {"error": ..., "host": host} when the remote
    host cannot be reached. Both outcomes are cached for CACHE_TTL seconds.
    """
    global _remote_host, _remote_port, _remote_platform
    cache_key = host or "_local"
    now = time.time()
    if not fresh and cache_key in _cache_by_host:
        ts, cached = _cache_by_host[cache_key]
        if (now - ts) < CACHE_TTL:
            return cached

    _remote_host = host or None
    _remote_port = ssh_port or None
    _remote_platform = platform or None
    try:
        result = _probe_system(host)
    finally:
        # Always drop the remote target, even if a probe raised, so helpers
        # called afterwards never keep talking to a stale host.
        _remote_host = None
        _remote_port = None
        _remote_platform = None

    _cache_by_host[cache_key] = (now, result)
    return result
+374
View File
@@ -0,0 +1,374 @@
"""Image generation model registry and VRAM fitting for Cookbook."""
# Curated registry of image generation models supported by diffusers.
# ONLY verified HuggingFace repo IDs.
# VRAM estimates are for inference (single image generation).
#
# Entry fields, as read by rank_image_models():
#   id, name, description - matched case-insensitively against the search text.
#   provider, capabilities, released - copied through to the ranked result.
#   params_b      - parameter count in billions (e.g. 6.0 for the 6B Z-Image).
#   vram_bf16 / vram_fp8 / vram_q4 - VRAM estimate for that quant, compared
#                   against the detected gpu_vram_gb; None entries are skipped
#                   when looking for a quant that fits.
#   default_quant - quant reported when no quant fits the detected GPU.
#   quant_repos   - quant name -> repo ID, looked up for the selected quant;
#                   may be empty.
#   quality, speed - numeric inputs to the ranking score.
IMAGE_MODEL_REGISTRY = [
    # ── Z-Image (Alibaba Tongyi) ──
    {
        "id": "Tongyi-MAI/Z-Image-Turbo",
        "name": "Z-Image Turbo",
        "provider": "Tongyi",
        "params_b": 6.0,
        "vram_bf16": 19.0,
        "vram_fp8": 10.0,
        "vram_q4": 6.0,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "drbaph/Z-Image-Turbo-FP8",
        },
        "capabilities": ["text-to-image"],
        "description": "6B distilled, 8-step. Sub-second on H800. Apache 2.0.",
        "quality": 92,
        "speed": 95,
        "released": "2025-12",
    },
    {
        "id": "Tongyi-MAI/Z-Image",
        "name": "Z-Image",
        "provider": "Tongyi",
        "params_b": 6.0,
        "vram_bf16": 19.0,
        "vram_fp8": 10.0,
        "vram_q4": 6.0,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "drbaph/Z-Image-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "Full undistilled model. Highest creative freedom. Apache 2.0.",
        "quality": 93,
        "speed": 70,
        "released": "2025-12",
    },
    # ── Qwen Image ──
    {
        "id": "Qwen/Qwen-Image-2512",
        "name": "Qwen Image 2512",
        "provider": "Qwen",
        "params_b": 20.0,
        "vram_bf16": 42.0,
        "vram_fp8": 22.0,
        "vram_q4": 14.0,
        "default_quant": "FP8",
        "quant_repos": {},
        "capabilities": ["text-to-image", "text-rendering"],
        "description": "Dec 2025 update. Better humans, finer detail, strong text. Apache 2.0.",
        "quality": 95,
        "speed": 50,
        "released": "2025-12",
    },
    {
        "id": "Qwen/Qwen-Image",
        "name": "Qwen Image",
        "provider": "Qwen",
        "params_b": 20.0,
        "vram_bf16": 42.0,
        "vram_fp8": 22.0,
        "vram_q4": 14.0,
        "default_quant": "FP8",
        "quant_repos": {},
        "capabilities": ["text-to-image", "text-rendering"],
        "description": "20B foundation. Best text rendering in images. Apache 2.0.",
        "quality": 94,
        "speed": 50,
        "released": "2025-08",
    },
    {
        "id": "Qwen/Qwen-Image-Edit-2511",
        "name": "Qwen Image Edit",
        "provider": "Qwen",
        "params_b": 20.0,
        "vram_bf16": 42.0,
        "vram_fp8": 22.0,
        "vram_q4": 14.0,
        "default_quant": "FP8",
        "quant_repos": {},
        "capabilities": ["image-editing", "inpainting"],
        "description": "Dedicated editing. Style transfer, object removal. Apache 2.0.",
        "quality": 92,
        "speed": 50,
        "released": "2025-11",
    },
    # ── Stable Diffusion (dedicated inpainting) ──
    {
        "id": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
        "name": "SDXL Inpainting",
        "provider": "Stability AI",
        "params_b": 3.5,
        "vram_bf16": 12.0,
        "vram_fp8": 8.0,
        "vram_q4": 6.0,
        "default_quant": "BF16",
        "quant_repos": {},
        "capabilities": ["inpainting", "image-editing"],
        "description": "SDXL fine-tuned for inpainting (9-channel UNet). Best SD-family fill quality; fits a 24GB card comfortably.",
        "quality": 86,
        "speed": 68,
        "released": "2023-11",
    },
    {
        "id": "stable-diffusion-v1-5/stable-diffusion-inpainting",
        "name": "SD 1.5 Inpainting",
        "provider": "Stability AI",
        "params_b": 1.1,
        "vram_bf16": 4.0,
        "vram_fp8": 3.0,
        "vram_q4": 2.5,
        "default_quant": "BF16",
        "quant_repos": {},
        "capabilities": ["inpainting"],
        "description": "Classic SD 1.5 inpaint. Very light and fast; lower fidelity than SDXL.",
        "quality": 70,
        "speed": 92,
        "released": "2022-10",
    },
    # ── FLUX ──
    {
        "id": "black-forest-labs/FLUX.1-dev",
        "name": "FLUX.1 Dev",
        "provider": "Black Forest Labs",
        "params_b": 12.0,
        "vram_bf16": 33.0,
        "vram_fp8": 17.0,
        "vram_q4": 10.0,
        "default_quant": "FP8",
        "quant_repos": {
            "FP8": "diffusers/FLUX.1-dev-torchao-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "High quality, detailed. Popular community model. Non-commercial.",
        "quality": 92,
        "speed": 55,
        "released": "2024-08",
    },
    {
        "id": "black-forest-labs/FLUX.1-schnell",
        "name": "FLUX.1 Schnell",
        "provider": "Black Forest Labs",
        "params_b": 12.0,
        "vram_bf16": 33.0,
        "vram_fp8": 17.0,
        "vram_q4": 10.0,
        "default_quant": "FP8",
        "quant_repos": {
            "FP8": "Kijai/flux-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "Fast 4-step variant. Apache 2.0 license.",
        "quality": 85,
        "speed": 90,
        "released": "2024-08",
    },
    # ── Stable Diffusion ──
    {
        "id": "stabilityai/stable-diffusion-3.5-medium",
        "name": "SD 3.5 Medium",
        "provider": "Stability AI",
        "params_b": 2.5,
        "vram_bf16": 12.0,
        "vram_fp8": 7.0,
        "vram_q4": None,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "2.5B lightweight, fast. Fits almost any GPU.",
        "quality": 75,
        "speed": 95,
        "released": "2024-10",
    },
    {
        "id": "stabilityai/stable-diffusion-3.5-large",
        "name": "SD 3.5 Large",
        "provider": "Stability AI",
        "params_b": 8.1,
        "vram_bf16": 22.0,
        "vram_fp8": 12.0,
        "vram_q4": None,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "8B high quality. Good balance of speed and quality.",
        "quality": 85,
        "speed": 70,
        "released": "2024-10",
    },
    {
        "id": "stabilityai/stable-diffusion-3.5-large-turbo",
        "name": "SD 3.5 Large Turbo",
        "provider": "Stability AI",
        "params_b": 8.1,
        "vram_bf16": 22.0,
        "vram_fp8": 12.0,
        "vram_q4": None,
        "default_quant": "BF16",
        "quant_repos": {
            "FP8": "Comfy-Org/stable-diffusion-3.5-fp8",
        },
        "capabilities": ["text-to-image"],
        "description": "Distilled for few-step inference. Fastest large SD.",
        "quality": 80,
        "speed": 92,
        "released": "2024-10",
    },
    {
        "id": "stabilityai/stable-diffusion-xl-base-1.0",
        "name": "SDXL",
        "provider": "Stability AI",
        "params_b": 3.5,
        "vram_bf16": 10.0,
        "vram_fp8": 6.0,
        "vram_q4": None,
        "default_quant": "BF16",
        "quant_repos": {},
        "capabilities": ["text-to-image"],
        "description": "Classic workhorse. Huge LoRA ecosystem. Fits 8GB+.",
        "quality": 72,
        "speed": 90,
        "released": "2023-07",
    },
    # ── Hunyuan ──
    {
        "id": "tencent/HunyuanImage-3.0",
        "name": "HunyuanImage 3.0",
        "provider": "Tencent",
        "params_b": 13.0,
        "vram_bf16": 30.0,
        "vram_fp8": 16.0,
        "vram_q4": 9.0,
        "default_quant": "FP8",
        "quant_repos": {
            "Q4": "wikeeyang/Hunyuan-Image-30-Qint4",
            "NF4": "EricRollei/HunyuanImage-3.0-Instruct-NF4",
        },
        "capabilities": ["text-to-image", "text-rendering"],
        "description": "Strong text rendering. Bilingual Chinese/English. 13B activated per token.",
        "quality": 88,
        "speed": 60,
        "released": "2025-09",
    },
    {
        "id": "tencent/HunyuanImage-3.0-Instruct-Distil",
        "name": "HunyuanImage 3.0 Distil",
        "provider": "Tencent",
        "params_b": 13.0,
        "vram_bf16": 30.0,
        "vram_fp8": 16.0,
        "vram_q4": 9.0,
        "default_quant": "FP8",
        "quant_repos": {},
        "capabilities": ["text-to-image", "text-rendering"],
        "description": "Distilled variant, fewer steps. Faster with comparable quality.",
        "quality": 85,
        "speed": 80,
        "released": "2026-01",
    },
]
def get_image_models():
    """Return the image model registry.

    The module-level ``IMAGE_MODEL_REGISTRY`` list is returned as-is (not a
    copy), so any mutation by a caller is visible to every other user of it.
    """
    return IMAGE_MODEL_REGISTRY
def rank_image_models(system, search=None, sort="fit"):
    """Score and rank image models against detected hardware.

    Args:
        system: Hardware description dict. Only ``has_gpu`` and
            ``gpu_vram_gb`` are read; a missing or ``None`` VRAM counts as 0.
        search: Optional case-insensitive substring filter, matched against
            each model's name, id and description.
        sort: ``"quality"``, ``"speed"`` or ``"vram"``; any other value
            (including the default ``"fit"``) orders by score, best first.

    Returns:
        A list with one dict per matching model holding the registry fields
        plus fit info: ``quant`` / ``vram_needed`` (the best quant that fits,
        or the model's default quant and its VRAM when nothing fits),
        ``quant_repo``, ``fits``, ``fit``, ``fit_label`` and ``score``.
    """
    gpu_vram = system.get("gpu_vram_gb", 0) or 0
    has_gpu = system.get("has_gpu", False)
    # Lower-case the search text once instead of once per model.
    needle = search.lower() if search else None
    # Highest-fidelity quant first: the first one that fits is selected.
    quant_vram_keys = (("BF16", "vram_bf16"), ("FP8", "vram_fp8"), ("Q4", "vram_q4"))
    vram_key_by_quant = dict(quant_vram_keys)
    fit_bonus = {"perfect": 20, "good": 10, "tight": 5, "no_fit": -30}

    results = []
    for model in IMAGE_MODEL_REGISTRY:
        # Filter by search
        if needle is not None:
            haystacks = (model["name"], model["id"], model.get("description", ""))
            if not any(needle in text.lower() for text in haystacks):
                continue

        # Determine best quant that fits
        quant = None
        vram_needed = None
        fits = False
        quant_repo = None
        if has_gpu and gpu_vram > 0:
            for q, vram_key in quant_vram_keys:
                v = model.get(vram_key)
                if v is not None and v <= gpu_vram * 0.90:  # 10% headroom
                    quant = q
                    vram_needed = v
                    fits = True
                    quant_repo = model.get("quant_repos", {}).get(q)
                    break

        # If nothing fits, show what the default quant needs. The VRAM figure
        # must belong to the quant we report; pairing e.g. "FP8" with the BF16
        # footprint would overstate the requirement.
        if not fits:
            quant = model["default_quant"]
            default_key = vram_key_by_quant.get(quant)
            vram_needed = model.get(default_key) if default_key else None
            if vram_needed is None:
                # Unknown default quant or no estimate for it: fall back to BF16.
                vram_needed = model.get("vram_bf16", 0)

        # Fit label
        if not has_gpu:
            fit = "no_gpu"
            fit_label = "No GPU"
        elif fits:
            # NOTE(review): a model only "fits" with >= 10% VRAM spare, so
            # "tight" is reached only when headroom is exactly at that
            # boundary. Thresholds are left as-is; confirm the intended tiers.
            headroom = gpu_vram - vram_needed
            if headroom > gpu_vram * 0.3:
                fit = "perfect"
                fit_label = "Perfect"
            elif headroom > gpu_vram * 0.1:
                fit = "good"
                fit_label = "Good"
            else:
                fit = "tight"
                fit_label = "Tight"
        else:
            fit = "no_fit"
            fit_label = "Too large"

        # Score: weighted sum of quality and speed, plus a fit bonus/penalty
        # ("no_gpu" gets neither).
        score = model["quality"] * 0.6 + model["speed"] * 0.2
        score += fit_bonus.get(fit, 0)

        results.append({
            "id": model["id"],
            "name": model["name"],
            "provider": model["provider"],
            "params_b": model["params_b"],
            "vram_needed": vram_needed,
            "quant": quant,
            "quant_repo": quant_repo,
            "fits": fits,
            "fit": fit,
            "fit_label": fit_label,
            "quality": model["quality"],
            "speed": model["speed"],
            "score": round(score, 1),
            "capabilities": model["capabilities"],
            "description": model["description"],
            "released": model.get("released", ""),
        })

    # Sort
    if sort == "quality":
        results.sort(key=lambda x: (-x["quality"], -x["score"]))
    elif sort == "speed":
        results.sort(key=lambda x: (-x["speed"], -x["score"]))
    elif sort == "vram":
        # Entries without a VRAM figure sort last.
        results.sort(key=lambda x: (x["vram_needed"] or 999, -x["score"]))
    else:  # fit (default)
        results.sort(key=lambda x: (-x["score"],))
    return results
+177
View File
@@ -0,0 +1,177 @@
import json
import os
import re
# GGUF quants in the order best_quant_for_budget() tries them; the first one
# whose memory estimate fits the budget wins.
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
# Per-quant factor that estimate_memory_gb() multiplies by the parameter count
# (in billions) to get the weight footprint in GB. Quants missing from this
# table fall back to 0.58 there.
QUANT_BPP = {
    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
    "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
    "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
    "AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
    "GPTQ-Int4": 0.50, "GPTQ-Int8": 1.0,
    "mlx-4bit": 0.55, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
}
# Per-quant speed multiplier. Not referenced by the functions in this module;
# presumably consumed by the scoring code that imports it — confirm there.
QUANT_SPEED_MULT = {
    "F16": 0.6, "BF16": 0.6, "FP8": 0.85,
    "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
    "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
    "AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
    "GPTQ-Int4": 1.2, "GPTQ-Int8": 0.85,
    "mlx-4bit": 1.15, "mlx-8bit": 0.85, "mlx-6bit": 1.0,
}
# Per-quant quality adjustment (zero or negative). Not referenced by the
# functions in this module; presumably added to a quality score by the
# importing scoring code — confirm there.
QUANT_QUALITY_PENALTY = {
    "F16": 0.0, "BF16": 0.0, "FP8": 0.0,
    "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
    "Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
    "AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
    "GPTQ-Int4": -3.0, "GPTQ-Int8": 0.0,
    "mlx-4bit": -4.0, "mlx-8bit": 0.0, "mlx-6bit": -1.0,
}
# Per-quant bytes-per-parameter table. Its values differ from QUANT_BPP for
# several quants (e.g. Q4_K_M is 0.5 here and 0.58 there), and it is not the
# table estimate_memory_gb() uses.
# NOTE(review): intended use is not visible in this module — confirm at the
# import sites before merging the two tables.
QUANT_BYTES_PER_PARAM = {
    "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
    "Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
    "Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
    "AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
    "GPTQ-Int4": 0.5, "GPTQ-Int8": 1.0,
    "mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
}
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8")


def is_prequantized(model):
    """Return True when the model ships in a fixed, pre-quantized format.

    Args:
        model: Model dict; only its ``"quantization"`` entry is read.

    Returns:
        True if the quantization label starts with one of
        ``PREQUANTIZED_PREFIXES``. A missing, ``None``, empty or non-string
        label yields False instead of raising.
    """
    # `or ""` also covers an explicit null in the catalog, which the
    # .get() default alone would not.
    quant = model.get("quantization") or ""
    # str.startswith accepts a tuple, so no explicit loop over prefixes.
    return isinstance(quant, str) and quant.startswith(PREQUANTIZED_PREFIXES)
def params_b(model):
    """Return the model's parameter count in billions.

    ``parameters_raw`` (an absolute count) wins when it is positive. Otherwise
    ``parameter_count`` is parsed as a number with an optional unit suffix:

    * ``B`` or ``G`` - billions
    * ``M`` - millions
    * ``K`` - thousands
    * ``T`` - trillions
    * no suffix - values of at least 1,000,000 are absolute counts; anything
      smaller is taken as millions (``"355"`` means 355M, not 355B).

    Args:
        model: Model dict; ``parameters_raw`` and ``parameter_count`` are read.

    Returns:
        The size in billions as a float, or 0.0 when neither field yields a
        usable number.
    """
    raw = model.get("parameters_raw")
    if raw and raw > 0:
        return raw / 1_000_000_000.0

    pc = model.get("parameter_count", "")
    if not pc:
        return 0.0
    # str() tolerates catalogs that store the count as a bare number.
    m = re.match(r"^([\d.]+)\s*([BKMGT]?)$", str(pc).strip().upper())
    if not m:
        return 0.0
    try:
        val = float(m.group(1))
    except ValueError:
        # The pattern admits strings such as "1.2.3" or "." that are not numbers.
        return 0.0

    suffix = m.group(2)
    if suffix in ("B", "G"):
        # "G" (giga) is accepted by the pattern and means the same as "B".
        return val
    if suffix == "M":
        return val / 1000.0
    if suffix == "K":
        return val / 1_000_000.0
    if suffix == "T":
        return val * 1000.0

    # No unit. A bare number is conventionally a millions count (e.g. "355" =
    # 355M), NOT billions — otherwise a 355M model would sort as 355B and leap
    # above every 7B/70B model. Very large bare values are raw parameter counts.
    if val >= 1_000_000:
        return val / 1_000_000_000.0
    return val / 1000.0
def estimate_memory_gb(model, quant, ctx):
    """Estimate the memory, in GB, needed to serve ``model`` at ``quant``/``ctx``.

    The estimate is the sum of three terms:

    * weights - total parameters times the per-quant factor. MoE models are
      charged for every expert, since all of them stay resident even though
      only the active ones compute per token;
    * KV cache - scaled by the active parameter count and the context length;
    * a fixed 0.5 allowance on top.

    Quants missing from ``QUANT_BPP`` use a factor of 0.58.
    """
    total_b = params_b(model)
    weight_factor = QUANT_BPP.get(quant, 0.58)
    active_b = _active_params_b(model)

    weights_gb = total_b * weight_factor
    kv_cache_gb = 0.000008 * active_b * ctx
    return weights_gb + kv_cache_gb + 0.5
def _active_params_b(model):
"""For MoE: active params per token (affects KV cache and speed, not total VRAM).
For dense: same as total params."""
if model.get("is_moe") and model.get("active_parameters"):
return model["active_parameters"] / 1_000_000_000.0
return params_b(model)
def best_quant_for_budget(model, budget_gb, ctx):
    """Pick the best quant (and context length) that fits ``budget_gb``.

    Pre-quantized models are only ever tried at their native quant; all other
    models walk ``QUANT_HIERARCHY`` from highest quality down. The requested
    context is tried first. If nothing fits, the context is halved repeatedly
    and the candidates retried, stopping once it would drop below 1024.

    Returns:
        ``(quant, ctx, mem_gb)`` for the first combination that fits, or
        ``(None, None, None)`` when none does.
    """
    if is_prequantized(model):
        candidates = [model.get("quantization", "Q4_K_M")]
    else:
        # GGUF: best quality first, then fall back.
        candidates = QUANT_HIERARCHY

    cur_ctx = ctx
    while True:
        for quant in candidates:
            needed = estimate_memory_gb(model, quant, cur_ctx)
            if needed <= budget_gb:
                return quant, cur_ctx, needed
        # Nothing fits at this context length: halve it and retry. Only the
        # reduced contexts are subject to the 1024 floor; the requested one is
        # always attempted.
        cur_ctx //= 2
        if cur_ctx < 1024:
            return None, None, None
def infer_use_case(model):
    """Classify a model into a use-case label from its name and use_case text.

    The lower-cased ``name`` and ``use_case`` fields are joined and scanned
    for keyword substrings. Rules are checked in order and the first match
    wins; models matching nothing are labelled ``"general"``.
    """
    haystack = "{} {}".format(
        model.get("name", "").lower(),
        model.get("use_case", "").lower(),
    )
    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("embedding", ("embedding", "embed", "bge")),
        ("tts", ("tts", "text-to-speech", "speech-synthesis", "cosyvoice", "parler")),
        ("stt", ("stt", "speech-to-text", "whisper", "transcri", "asr")),
        ("coding", ("code",)),
        ("multimodal", ("vision", "multimodal", "vlm", "vl-")),
        ("reasoning", ("reason", "chain-of-thought", "deepseek-r1")),
        ("chat", ("chat", "instruction")),
    )
    for label, keywords in rules:
        for keyword in keywords:
            if keyword in haystack:
                return label
    return "general"
# Parsed model catalog, populated on the first get_models() call.
_models_cache = None


def get_models():
    """Return the parsed model catalog, loading it on first use.

    The catalog is read from ``data/hf_models.json`` next to this module and
    kept in ``_models_cache``. The same object is returned on every call.

    Returns:
        The decoded JSON content, or an empty list when the file is missing
        or cannot be decoded. A failed load is cached too, so the file is not
        re-read until the process restarts.
    """
    global _models_cache
    if _models_cache is None:
        data_path = os.path.join(os.path.dirname(__file__), "data", "hf_models.json")
        try:
            # JSON is UTF-8; without an explicit encoding, open() uses the
            # platform locale and can mis-decode non-ASCII text.
            with open(data_path, encoding="utf-8") as f:
                _models_cache = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
            # Best effort: an absent or corrupt catalog means "no models".
            _models_cache = []
    return _models_cache
def model_catalog_path():
    """Return the path of the ``data/hf_models.json`` catalog beside this module."""
    module_dir = os.path.dirname(__file__)
    catalog_path = os.path.join(module_dir, "data", "hf_models.json")
    return catalog_path