mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Cookbook: scoring fixes, UI polish, false-finished + stale-state bug fixes
Backend (services/hwfit + routes): - rank_models picks visible set by REQUESTED column, not always score — sorting by Param now shows highest-param models PERIOD (incl. too_tight). - New fit_only param. Multi-GPU rigs filter GGUF Q*/IQ quants (vLLM/SGLang cannot serve them); default non-prequantized to BF16 on 2+ GPUs. - AWQ / GPTQ-8bit get a -1.0 quality penalty (was 0.0, tied with FP8), so FP8 wins when both fit. - Version-aware tiebreaker (parse Mn.n / Vn) — MiniMax-M2.7 ranks above M2.5 on equal composite score; >=100B integers not misread as versions. - /api/cookbook/hf-latest no longer drops models without an "NB" pattern in the repo id (MiniMax-M2.7, DeepSeek-V4-Pro etc. were silently filtered). - Cached-model scan: atexit flushes models JSON even if the script is killed mid-walk; each scan_dir wrapped in try/except; timeout 60s -> 180s. - KB granularity for sub-MB sizes (was "0 MB" for 12 KB shells). New "stalled" status for shells <1 MB with no .incomplete files. - /api/cookbook/state POST guard: rejects "done" download tasks lacking DOWNLOAD_OK / DOWNLOAD_FAILED / /snapshots/ when the last-mentioned shard is N<total — stops stale tabs from poisoning persisted state. - hf_models.json: add zai-org/GLM-5.1; flip zai-org/GLM-5 quantization Q4_K_M -> BF16 (it is the native base, not a quant). Frontend (static/js): - Scan/Download toolbar: quant defaults to All; ctx slider (8k/16k/32k/ 50k/128k/Max) ported from origin/main with sort=fit on drag, sort=score on Max. GPU toggle commits _activeCount to maxGpu on initial render. Fit column header tagged with active budget (RAM / GPU / N GPU). - Foldable Download admin-card: the Download h2 is the chevron trigger; state persists in localStorage. - Download card surfaces destination dir (Dir: <path>). Same dir on running task row, font/color matched to uptime (9px Fira Code muted, opacity .4). - Serve panel ctx text input always resets to model max on open. Sub-MB cached models show with red "download stalled" badge. - Bulk-select Cancel + Delete reset the Select button label on exit. - Cookbook running: false-finished bug fixed — DOWNLOAD_OK or /snapshots/ required; bare "Download complete" no longer marks the task done after the first config file. Clear button now sends tmux kill-session too. True overall % for multi-shard downloads: ((N-1)+frac)/total instead of hf_transfer per-shard aggregate. - Diagnosis card simplified: removed fold toggle, copy button, dismiss X. Suggestion font matches message body (12px). - HF token field flashes green check + "Saved" on save. - Cached scan no longer counts stalled rows as downloaded in Scan/Download. CSS: - dep Install button width pinned to 76px to match Installed split. - task-sub row +1px; task-status badge gets margin-right 8px. - Ctx slider styled like gallery editor sliders (thin pill rail, red thumb). - Bulk-select cancel button top -3px -> -5px.
This commit is contained in:
+23
-10
@@ -5,7 +5,7 @@ import re
|
||||
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
|
||||
|
||||
QUANT_BPP = {
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
|
||||
"Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
|
||||
"Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
|
||||
"AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
|
||||
@@ -14,7 +14,7 @@ QUANT_BPP = {
|
||||
}
|
||||
|
||||
QUANT_SPEED_MULT = {
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85,
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85, "INT8": 0.85, "NVFP4": 1.1,
|
||||
"Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
|
||||
"Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
|
||||
"AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
|
||||
@@ -23,16 +23,20 @@ QUANT_SPEED_MULT = {
|
||||
}
|
||||
|
||||
QUANT_QUALITY_PENALTY = {
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0,
|
||||
"Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0, "INT8": 0.0, "NVFP4": -0.5,
|
||||
"Q8_0": -0.5, "Q6_K": -1.5, "Q5_K_M": -2.5,
|
||||
"Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
|
||||
"AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
|
||||
"GPTQ-Int4": -3.0, "GPTQ-Int8": 0.0,
|
||||
"mlx-4bit": -4.0, "mlx-8bit": 0.0, "mlx-6bit": -1.0,
|
||||
# Bare "AWQ" and "AWQ-8bit" used to be 0.0 (tied with FP8). In practice
|
||||
# AWQ-anything is a calibrated reconstruction, not raw 8-bit weights —
|
||||
# there's a small but real quality loss vs FP8. Give them a slight
|
||||
# penalty so FP8 wins when both fit. AWQ-4bit stays heavier.
|
||||
"AWQ": -1.0, "AWQ-4bit": -4.0, "AWQ-8bit": -1.0,
|
||||
"GPTQ": -1.0, "GPTQ-Int4": -4.0, "GPTQ-Int8": -1.0,
|
||||
"mlx-4bit": -4.0, "mlx-8bit": -0.5, "mlx-6bit": -1.5,
|
||||
}
|
||||
|
||||
QUANT_BYTES_PER_PARAM = {
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
|
||||
"Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
|
||||
"Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
|
||||
"AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
|
||||
@@ -41,12 +45,21 @@ QUANT_BYTES_PER_PARAM = {
|
||||
}
|
||||
|
||||
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
|
||||
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8")
|
||||
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "INT8", "NVFP4")
|
||||
|
||||
|
||||
def is_prequantized(model):
|
||||
q = model.get("quantization", "")
|
||||
return any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
|
||||
name = (model.get("name") or "").lower()
|
||||
fmt = (model.get("format") or "").lower()
|
||||
text = f"{name} {fmt}"
|
||||
return (
|
||||
"nvfp4" in text
|
||||
or re.search(r"(^|[-_/])fp8($|[-_/\s])", text) is not None
|
||||
or (not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text) is not None)
|
||||
or any(x in text for x in ("awq", "gptq", "mlx"))
|
||||
or any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
|
||||
)
|
||||
|
||||
|
||||
def params_b(model):
|
||||
|
||||
Reference in New Issue
Block a user