mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
eb79b76432
Backend (services/hwfit + routes): - rank_models picks visible set by REQUESTED column, not always score — sorting by Param now shows highest-param models PERIOD (incl. too_tight). - New fit_only param. Multi-GPU rigs filter GGUF Q*/IQ quants (vLLM/SGLang cannot serve them); default non-prequantized to BF16 on 2+ GPUs. - AWQ / GPTQ-8bit get a -1.0 quality penalty (was 0.0, tied with FP8), so FP8 wins when both fit. - Version-aware tiebreaker (parse Mn.n / Vn) — MiniMax-M2.7 ranks above M2.5 on equal composite score; >=100B integers not misread as versions. - /api/cookbook/hf-latest no longer drops models without an "NB" pattern in the repo id (MiniMax-M2.7, DeepSeek-V4-Pro etc. were silently filtered). - Cached-model scan: atexit flushes models JSON even if the script is killed mid-walk; each scan_dir wrapped in try/except; timeout 60s -> 180s. - KB granularity for sub-MB sizes (was "0 MB" for 12 KB shells). New "stalled" status for shells <1 MB with no .incomplete files. - /api/cookbook/state POST guard: rejects "done" download tasks lacking DOWNLOAD_OK / DOWNLOAD_FAILED / /snapshots/ when the last-mentioned shard is N<total — stops stale tabs from poisoning persisted state. - hf_models.json: add zai-org/GLM-5.1; flip zai-org/GLM-5 quantization Q4_K_M -> BF16 (it is the native base, not a quant). Frontend (static/js): - Scan/Download toolbar: quant defaults to All; ctx slider (8k/16k/32k/ 50k/128k/Max) ported from origin/main with sort=fit on drag, sort=score on Max. GPU toggle commits _activeCount to maxGpu on initial render. Fit column header tagged with active budget (RAM / GPU / N GPU). - Foldable Download admin-card: the Download h2 is the chevron trigger; state persists in localStorage. - Download card surfaces destination dir (Dir: <path>). Same dir on running task row, font/color matched to uptime (9px Fira Code muted, opacity .4). - Serve panel ctx text input always resets to model max on open. Sub-MB cached models show with red "download stalled" badge. - Bulk-select Cancel + Delete reset the Select button label on exit. - Cookbook running: false-finished bug fixed — DOWNLOAD_OK or /snapshots/ required; bare "Download complete" no longer marks the task done after the first config file. Clear button now sends tmux kill-session too. True overall % for multi-shard downloads: ((N-1)+frac)/total instead of hf_transfer per-shard aggregate. - Diagnosis card simplified: removed fold toggle, copy button, dismiss X. Suggestion font matches message body (12px). - HF token field flashes green check + "Saved" on save. - Cached scan no longer counts stalled rows as downloaded in Scan/Download. CSS: - dep Install button width pinned to 76px to match Installed split. - task-sub row +1px; task-status badge gets margin-right 8px. - Ctx slider styled like gallery editor sliders (thin pill rail, red thumb). - Bulk-select cancel button top -3px -> -5px.
212 lines
10 KiB
Python
212 lines
10 KiB
Python
from copy import deepcopy
|
||
|
||
from fastapi import APIRouter
|
||
|
||
|
||
def setup_hwfit_routes():
|
||
router = APIRouter(prefix="/api/hwfit", tags=["hwfit"])
|
||
|
||
def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_vram_gb="", manual_ram_gb="", manual_backend=""):
|
||
"""Manual hardware is a "what if I had this setup" simulator —
|
||
REPLACES the detected hardware entirely instead of adding to it.
|
||
|
||
The previous additive behavior averaged the manual VRAM across
|
||
all GPUs (base + manual), which meant adding "1× 400 GB" on top
|
||
of "2× 70 GB" only nudged the per-GPU cap from 70 to 180 GB
|
||
(= 540 / 3), so GGUF models bigger than that still didn't surface
|
||
— exactly the "cap stuck at detected level" bug the user hit.
|
||
"""
|
||
manual_mode = (manual_mode or "").lower()
|
||
if manual_mode not in {"gpu", "ram"}:
|
||
return system
|
||
|
||
try:
|
||
override_ram_gb = float(manual_ram_gb) if manual_ram_gb else 0
|
||
except ValueError:
|
||
override_ram_gb = 0
|
||
override_ram_gb = max(0.0, override_ram_gb)
|
||
if override_ram_gb:
|
||
# Replace RAM, don't add. The number in the field is the
|
||
# TOTAL system memory the user wants to simulate.
|
||
system["available_ram_gb"] = round(override_ram_gb, 1)
|
||
system["total_ram_gb"] = round(override_ram_gb, 1)
|
||
system["manual_hardware"] = True
|
||
|
||
if manual_mode == "ram":
|
||
# RAM-only simulation — wipe GPU entirely so the ranker uses
|
||
# CPU/RAM paths.
|
||
system["has_gpu"] = False
|
||
system["gpu_name"] = None
|
||
system["gpu_vram_gb"] = 0
|
||
system["gpu_count"] = 0
|
||
system["gpus"] = []
|
||
system["gpu_groups"] = []
|
||
system["backend"] = "cpu_x86"
|
||
return system
|
||
|
||
try:
|
||
count = int(manual_gpu_count) if manual_gpu_count else 1
|
||
except ValueError:
|
||
count = 1
|
||
try:
|
||
vram_each = float(manual_vram_gb) if manual_vram_gb else 8.0
|
||
except ValueError:
|
||
vram_each = 8.0
|
||
count = max(1, min(count, 16))
|
||
vram_each = max(1.0, vram_each)
|
||
backend = (manual_backend or system.get("backend") or "cuda").lower()
|
||
if backend not in {"cuda", "rocm", "cpu_x86", "cpu_arm"}:
|
||
backend = "cuda"
|
||
total_vram = round(vram_each * count, 1)
|
||
gpu_name = f"Simulated {backend.upper()} GPU" + (f" × {count}" if count > 1 else "")
|
||
system["has_gpu"] = True
|
||
system["gpu_name"] = gpu_name
|
||
system["gpu_vram_gb"] = total_vram
|
||
system["gpu_count"] = count
|
||
system["gpus"] = [
|
||
{"index": i, "name": gpu_name, "vram_gb": vram_each}
|
||
for i in range(count)
|
||
]
|
||
# Single homogeneous pool — vram_each here is the ACTUAL per-GPU
|
||
# VRAM the user entered, not an average. That's the whole point:
|
||
# raising vram_each lifts the per-GPU cap (GGUF, tensor-parallel
|
||
# math) all the way up, not just by a small fraction.
|
||
system["gpu_groups"] = [{
|
||
"name": gpu_name,
|
||
"vram_each": vram_each,
|
||
"count": count,
|
||
"indices": list(range(count)),
|
||
"vram_total": total_vram,
|
||
}]
|
||
system["homogeneous"] = True
|
||
system["backend"] = backend
|
||
return system
|
||
|
||
@router.get("/system")
|
||
def get_system(host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False):
|
||
"""Detect and return current system hardware info. Pass host=user@server for remote.
|
||
fresh=true bypasses the per-host cache (the Rescan button)."""
|
||
from services.hwfit.hardware import detect_system
|
||
return detect_system(host=host, ssh_port=ssh_port, platform=platform, fresh=fresh)
|
||
|
||
@router.get("/models")
|
||
def get_models(use_case: str = "", sort: str = "score", limit: int = 50, search: str = "", host: str = "", quant: str = "", ctx: str = "", gpu_count: str = "", gpu_group: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, manual_mode: str = "", manual_gpu_count: str = "", manual_vram_gb: str = "", manual_ram_gb: str = "", manual_backend: str = "", ignore_detected_gpu: bool = False, ignore_detected_ram: bool = False, fit_only: bool = False):
|
||
"""Rank LLM models against detected hardware and return scored results.
|
||
gpu_count: override GPU count (0 = CPU only, 1-N = simulate N GPUs of the
|
||
active group). gpu_group: index into system.gpu_groups (the homogeneous
|
||
pools) to target — empty/auto = the largest pool. vLLM can only
|
||
tensor-parallel across identical GPUs, so we never mix pools.
|
||
fresh=true bypasses the hardware-detection cache."""
|
||
from services.hwfit.hardware import detect_system
|
||
from services.hwfit.fit import rank_models
|
||
from services.hwfit.models import get_models, model_catalog_path
|
||
system = deepcopy(detect_system(host=host, ssh_port=ssh_port, platform=platform, fresh=fresh))
|
||
if system.get("error"):
|
||
return {"system": system, "models": [], "error": system["error"]}
|
||
if not get_models():
|
||
return {
|
||
"system": system,
|
||
"models": [],
|
||
"error": f"Model catalog missing or empty: {model_catalog_path()}",
|
||
}
|
||
|
||
if ignore_detected_gpu:
|
||
system["has_gpu"] = False
|
||
system["gpu_name"] = None
|
||
system["gpu_vram_gb"] = 0
|
||
system["gpu_count"] = 0
|
||
system["gpus"] = []
|
||
system["gpu_groups"] = []
|
||
if ignore_detected_ram:
|
||
system["available_ram_gb"] = 0
|
||
system["total_ram_gb"] = 0
|
||
|
||
system = _apply_manual_hardware(system, manual_mode, manual_gpu_count, manual_vram_gb, manual_ram_gb, manual_backend)
|
||
|
||
# Keep the raw detection around so the UI can still show the box's full
|
||
# GPU complement even while we rank against one homogeneous pool.
|
||
system["detected_gpu_vram_gb"] = system.get("gpu_vram_gb")
|
||
system["detected_gpu_count"] = system.get("gpu_count")
|
||
|
||
groups = system.get("gpu_groups") or []
|
||
# Resolve the target homogeneous pool. Default (auto) = the largest pool,
|
||
# which for a uniform box is simply "all the GPUs" — no behaviour change.
|
||
grp = None
|
||
if groups:
|
||
try:
|
||
gidx = int(gpu_group) if gpu_group != "" else 0
|
||
except ValueError:
|
||
gidx = 0
|
||
if 0 <= gidx < len(groups):
|
||
grp = groups[gidx]
|
||
|
||
def _apply_group(g, n):
|
||
n = max(1, min(n, g["count"]))
|
||
system["gpu_count"] = n
|
||
system["gpu_vram_gb"] = round(g["vram_each"] * n, 1)
|
||
system["gpu_name"] = g["name"]
|
||
system["active_group"] = {**g, "use_count": n}
|
||
|
||
if gpu_count != "":
|
||
n = int(gpu_count)
|
||
if n == 0:
|
||
# RAM-only mode: rank against system memory, offload allowed.
|
||
system["has_gpu"] = False
|
||
system["gpu_vram_gb"] = 0
|
||
system["gpu_count"] = 0
|
||
system["gpu_only"] = False
|
||
system.pop("active_group", None)
|
||
elif grp:
|
||
_apply_group(grp, n)
|
||
system["gpu_only"] = True
|
||
else:
|
||
# No per-GPU detail (older detection) — assume uniform split.
|
||
single_vram = (system.get("gpu_vram_gb") or 0) / (system.get("gpu_count") or 1)
|
||
system["gpu_count"] = max(1, n)
|
||
system["gpu_vram_gb"] = round(single_vram * max(1, n), 1)
|
||
system["gpu_only"] = True
|
||
elif grp:
|
||
# No explicit count, but we still pin to one pool so heterogeneous
|
||
# boxes rank against a real mixable group, not a fictional VRAM sum.
|
||
# gpu_only stays off here so the default view still surfaces offload.
|
||
_apply_group(grp, grp["count"])
|
||
|
||
try:
|
||
target_context = int(ctx) if ctx else None
|
||
except ValueError:
|
||
target_context = None
|
||
if target_context is not None:
|
||
target_context = max(1024, min(target_context, 1000000))
|
||
|
||
results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None, target_context=target_context, fit_only=fit_only)
|
||
return {"system": system, "models": results}
|
||
|
||
@router.get("/image-models")
|
||
def get_image_models(sort: str = "fit", search: str = "", host: str = "", gpu_count: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, manual_mode: str = "", manual_gpu_count: str = "", manual_vram_gb: str = "", manual_ram_gb: str = "", manual_backend: str = "", ignore_detected_gpu: bool = False, ignore_detected_ram: bool = False):
|
||
"""Rank image generation models against detected hardware."""
|
||
from services.hwfit.hardware import detect_system
|
||
from services.hwfit.image_models import rank_image_models
|
||
system = deepcopy(detect_system(host=host, ssh_port=ssh_port, platform=platform, fresh=fresh))
|
||
if system.get("error"):
|
||
return {"system": system, "models": [], "error": system["error"]}
|
||
if ignore_detected_gpu:
|
||
system["has_gpu"] = False
|
||
system["gpu_name"] = None
|
||
system["gpu_vram_gb"] = 0
|
||
system["gpu_count"] = 0
|
||
system["gpus"] = []
|
||
system["gpu_groups"] = []
|
||
if ignore_detected_ram:
|
||
system["available_ram_gb"] = 0
|
||
system["total_ram_gb"] = 0
|
||
system = _apply_manual_hardware(system, manual_mode, manual_gpu_count, manual_vram_gb, manual_ram_gb, manual_backend)
|
||
# Image models use a single GPU — always use per-GPU VRAM
|
||
gpu_vrams = [float(g.get("vram_gb") or 0) for g in (system.get("gpus") or []) if isinstance(g, dict)]
|
||
single_vram = max(gpu_vrams) if gpu_vrams else ((system.get("gpu_vram_gb") or 0) / max(system.get("gpu_count") or 1, 1))
|
||
system["gpu_vram_gb"] = single_vram
|
||
system["gpu_count"] = 1 if single_vram > 0 else 0
|
||
results = rank_image_models(system, search=search or None, sort=sort)
|
||
return {"system": system, "models": results}
|
||
|
||
return router
|