mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
Fix native Cookbook quant classification
This commit is contained in:
+30
-27
@@ -219,9 +219,9 @@ def _quant_bits(q):
|
||||
Returns 0 when unknown (caller treats unknown as "don't filter")."""
|
||||
qu = (q or "").upper().replace("-", "").replace("_", "").replace(" ", "")
|
||||
# GGUF k-quants + float formats
|
||||
if qu.startswith("Q8") or "FP8" in qu:
|
||||
if qu.startswith("Q8") or "FP8" in qu or "INT8" in qu or qu.startswith("W8"):
|
||||
return 8
|
||||
if qu.startswith("Q4") or qu.startswith("IQ4"):
|
||||
if qu.startswith("Q4") or qu.startswith("IQ4") or "FP4" in qu or "NF4" in qu or "INT4" in qu or qu.startswith("W4"):
|
||||
return 4
|
||||
if qu.startswith("Q2") or qu.startswith("IQ2"):
|
||||
return 2
|
||||
@@ -233,7 +233,7 @@ def _quant_bits(q):
|
||||
return 6
|
||||
if qu.startswith("F16") or qu.startswith("BF16") or qu.startswith("F32"):
|
||||
return 16
|
||||
# Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 …)
|
||||
# Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 ...)
|
||||
m = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", qu) or re.search(r"(\d{1,2})BIT", qu)
|
||||
if m:
|
||||
b = int(m.group(1))
|
||||
@@ -282,15 +282,21 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
||||
else:
|
||||
effective_vram = gpu_vram
|
||||
|
||||
native_gpu_only = preq and not native_quant.startswith("mlx-")
|
||||
|
||||
# Determine which quant to evaluate at
|
||||
native_quant_prefixes = (
|
||||
"AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
|
||||
"INT4", "INT8", "W4A16", "W8A8", "W8A16",
|
||||
)
|
||||
|
||||
if preq:
|
||||
# AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
|
||||
# GGUF quant tier (Q4/Q8/etc.), do not treat a same-bit AWQ/GPTQ build
|
||||
# as equivalent. "Q4" means llama.cpp/Ollama-style GGUF in this UI;
|
||||
# AWQ/GPTQ/FP8 are separate GPU-serving formats and must only appear
|
||||
# when explicitly selected or when no quant filter is applied.
|
||||
# Native HF/vLLM quantized repos come at a fixed format. If the user
|
||||
# picked a GGUF quant tier (Q4/Q8/etc.), do not treat same-bit
|
||||
# AWQ/GPTQ/FP8/FP4 builds as equivalent; those formats are separate
|
||||
# serving paths and only appear when explicitly selected or unfiltered.
|
||||
if target_quant:
|
||||
if not any(target_quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4")):
|
||||
if not any(target_quant.startswith(p) for p in native_quant_prefixes):
|
||||
return None
|
||||
_tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
|
||||
if _tb and _nb and _tb != _nb:
|
||||
@@ -303,16 +309,7 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
||||
# Default: Q4_K_M (user's stated preference)
|
||||
quant_to_try = "Q4_K_M"
|
||||
|
||||
result = _try_quant_at(model, quant_to_try, ctx, effective_vram, eff_ram)
|
||||
|
||||
# If target quant doesn't fit and it's not pre-quantized, try lower quants
|
||||
if result is None and not preq and target_quant:
|
||||
from services.hwfit.models import QUANT_HIERARCHY
|
||||
idx = QUANT_HIERARCHY.index(target_quant) if target_quant in QUANT_HIERARCHY else -1
|
||||
for q in QUANT_HIERARCHY[idx + 1:]:
|
||||
result = _try_quant_at(model, q, ctx, effective_vram, eff_ram)
|
||||
if result:
|
||||
break
|
||||
result = _try_quant_at(model, quant_to_try, ctx, effective_vram, 0 if native_gpu_only else eff_ram)
|
||||
|
||||
if result is None:
|
||||
# Model doesn't fit on the user's current hardware. Surface it
|
||||
@@ -447,8 +444,11 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
results.sort(key=sort_fn, reverse=(sort != "vram"))
|
||||
return results[:limit]
|
||||
|
||||
# If user picked a prequantized format (AWQ/FP8/GPTQ/NVFP4), filter to only those models
|
||||
filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4"))
|
||||
# If user picked a native prequantized format, filter to only those models.
|
||||
filter_native = quant and any(quant.startswith(p) for p in (
|
||||
"AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
|
||||
"INT4", "INT8", "W4A16", "W8A8", "W8A16",
|
||||
))
|
||||
|
||||
system_backend = (system.get("backend") or "").lower()
|
||||
apple_silicon = system_backend in ("mps", "metal", "apple")
|
||||
@@ -459,9 +459,9 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
if "nvfp4" in (m.get("name") or "").lower():
|
||||
native_q = "NVFP4"
|
||||
|
||||
# MLX is Apple Silicon only. Hide MLX rows on non-Mac hardware scans,
|
||||
# but leave them visible on Metal/MPS so Mac support is not broken.
|
||||
if not apple_silicon and (native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()):
|
||||
# MLX needs the mlx_lm runtime, which Odysseus does not generate serve
|
||||
# commands for. Hide it on every backend, including Metal.
|
||||
if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower():
|
||||
continue
|
||||
|
||||
# ROCm support for vLLM/SGLang quantized safetensors is too brittle to
|
||||
@@ -479,20 +479,23 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
|
||||
# this the Cookbook recommends models the Mac can't run; on CUDA these
|
||||
# stay visible because vLLM serves safetensors directly.
|
||||
is_mlx = native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()
|
||||
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources") or is_mlx):
|
||||
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
|
||||
continue
|
||||
|
||||
# Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
|
||||
# Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
|
||||
if filter_native:
|
||||
if quant == "FP8" and native_q != "FP8":
|
||||
continue
|
||||
if quant == "FP4" and native_q not in ("FP4", "NVFP4", "MXFP4", "NF4"):
|
||||
continue
|
||||
if quant.startswith("AWQ") and not native_q.startswith("AWQ"):
|
||||
continue
|
||||
if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
|
||||
continue
|
||||
if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"):
|
||||
continue
|
||||
if quant in ("INT4", "INT8", "W4A16", "W8A8", "W8A16") and native_q != quant:
|
||||
continue
|
||||
|
||||
if search:
|
||||
name = m.get("name", "").lower()
|
||||
|
||||
@@ -5,7 +5,9 @@ import re
|
||||
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
|
||||
|
||||
QUANT_BPP = {
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"FP4": 0.50, "NVFP4": 0.50, "MXFP4": 0.50, "NF4": 0.50,
|
||||
"INT4": 0.50, "INT8": 1.0, "W4A16": 0.50, "W8A8": 1.0, "W8A16": 1.0,
|
||||
"Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
|
||||
"Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
|
||||
"AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
|
||||
@@ -14,7 +16,9 @@ QUANT_BPP = {
|
||||
}
|
||||
|
||||
QUANT_SPEED_MULT = {
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85, "NVFP4": 1.1,
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85,
|
||||
"FP4": 1.15, "NVFP4": 1.15, "MXFP4": 1.15, "NF4": 1.10,
|
||||
"INT4": 1.15, "INT8": 0.85, "W4A16": 1.15, "W8A8": 0.85, "W8A16": 0.85,
|
||||
"Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
|
||||
"Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
|
||||
"AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
|
||||
@@ -23,7 +27,9 @@ QUANT_SPEED_MULT = {
|
||||
}
|
||||
|
||||
QUANT_QUALITY_PENALTY = {
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0, "NVFP4": 0.0,
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0,
|
||||
"FP4": -3.0, "NVFP4": -3.0, "MXFP4": -3.0, "NF4": -4.0,
|
||||
"INT4": -4.0, "INT8": 0.0, "W4A16": -4.0, "W8A8": 0.0, "W8A16": 0.0,
|
||||
"Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
|
||||
"Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
|
||||
"AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
|
||||
@@ -32,7 +38,9 @@ QUANT_QUALITY_PENALTY = {
|
||||
}
|
||||
|
||||
QUANT_BYTES_PER_PARAM = {
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"FP4": 0.5, "NVFP4": 0.5, "MXFP4": 0.5, "NF4": 0.5,
|
||||
"INT4": 0.5, "INT8": 1.0, "W4A16": 0.5, "W8A8": 1.0, "W8A16": 1.0,
|
||||
"Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
|
||||
"Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
|
||||
"AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
|
||||
@@ -40,14 +48,60 @@ QUANT_BYTES_PER_PARAM = {
|
||||
"mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
|
||||
}
|
||||
|
||||
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy.
# These are native HF/vLLM-style repos, not llama.cpp GGUF quant tiers.
# Entries are matched with str.startswith(), so "AWQ-" covers "AWQ-4bit",
# "AWQ-8bit", and so on.
PREQUANTIZED_PREFIXES = (
    "AWQ-", "GPTQ-", "mlx-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
    "INT4", "INT8", "W4A16", "W8A8", "W8A16",
)
|
||||
|
||||
|
||||
def infer_quantization_from_name(name):
    """Guess a native quantization label from a model or repo name.

    The name is lower-cased and tested against known format markers in a
    fixed priority order; the first match wins:

    1. ``nvfp4`` / ``mxfp4`` anywhere in the name.
    2. ``nf4``, ``fp4``, ``w4a16``, ``w8a8``, ``w8a16`` as a whole token,
       i.e. at the start/end of the name or delimited by ``-``, ``_``, ``/``.
    3. ``awq`` / ``gptq`` / ``mlx``, refined by a bit-width hint in the name.
    4. ``fp8``, then generic 4-bit hints, then generic 8-bit hints.

    Args:
        name: Model or repository name. ``None`` and ``""`` are accepted.

    Returns:
        A label such as ``"NVFP4"``, ``"AWQ-4bit"``, ``"GPTQ-Int8"``,
        ``"mlx-6bit"`` or ``"INT4"``; ``""`` when nothing is recognised.
    """
    n = (name or "").lower()

    # The specific FP4 variants are plain substring checks and must run
    # before the generic "fp4" token test below.
    if "nvfp4" in n:
        return "NVFP4"
    if "mxfp4" in n:
        return "MXFP4"

    # Short markers must be whole tokens so they do not fire on unrelated
    # substrings. The label is simply the upper-cased token.
    for token in ("nf4", "fp4", "w4a16", "w8a8", "w8a16"):
        if re.search(r"(^|[-_/])" + token + r"($|[-_/])", n):
            return token.upper()

    # Bit-width hints, accepted both with and without a hyphen.
    is8 = "8bit" in n or "8-bit" in n or "int8" in n
    is6 = "6bit" in n or "6-bit" in n

    if "awq" in n:
        return "AWQ-8bit" if is8 else "AWQ-4bit"
    if "gptq" in n:
        return "GPTQ-Int8" if is8 else "GPTQ-Int4"
    if "mlx" in n:
        # A 6-bit hint takes precedence over an 8-bit one; 4-bit is the
        # fallback when no width hint is present.
        if is6:
            return "mlx-6bit"
        return "mlx-8bit" if is8 else "mlx-4bit"
    if "fp8" in n:
        return "FP8"
    if "int4" in n or "4bit" in n or "4-bit" in n:
        return "INT4"
    if is8:
        return "INT8"
    return ""
|
||||
|
||||
|
||||
def _normalize_model_entry(model):
    """Fill in a model entry's quantization label from its name.

    Non-dict values are returned untouched. For a dict, the ``"name"`` field
    is passed to ``infer_quantization_from_name``; when that yields a label,
    it overwrites ``"quantization"`` if the current value is missing, empty,
    or ``"Q4_K_M"``, or if the entry has a truthy ``"_discovered"`` field.

    The dict is modified in place and the same object is returned.
    """
    if isinstance(model, dict):
        guessed = infer_quantization_from_name(model.get("name", ""))
        if guessed:
            current = model.get("quantization")
            overridable = current in (None, "", "Q4_K_M")
            if overridable or model.get("_discovered"):
                model["quantization"] = guessed
    return model
|
||||
|
||||
|
||||
def is_prequantized(model):
    """Report whether a model entry is in a native pre-quantized format.

    A model counts as pre-quantized when its name contains ``nvfp4``
    (case-insensitive) or its ``"quantization"`` value starts with one of
    ``PREQUANTIZED_PREFIXES``.

    Args:
        model: Model entry dict; ``"quantization"`` and ``"name"`` may be
            absent or ``None``.

    Returns:
        ``True`` if the entry is pre-quantized, ``False`` otherwise.
    """
    # `or ""` rather than a .get() default: the key can be present with a
    # None value, and None has no .startswith().
    q = model.get("quantization") or ""
    name = (model.get("name") or "").lower()
    return "nvfp4" in name or any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
|
||||
|
||||
|
||||
def params_b(model):
|
||||
@@ -168,7 +222,7 @@ def get_models():
|
||||
data_path = os.path.join(os.path.dirname(__file__), "data", "hf_models.json")
|
||||
try:
|
||||
with open(data_path, encoding="utf-8") as f:
|
||||
_models_cache = json.load(f)
|
||||
_models_cache = [_normalize_model_entry(m) for m in json.load(f)]
|
||||
except (FileNotFoundError, json.JSONDecodeError):
|
||||
_models_cache = []
|
||||
return _models_cache
|
||||
|
||||
Reference in New Issue
Block a user