mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-30 00:22:10 -04:00
Merge remote-tracking branch 'origin/main' into visual-pr-playground
# Conflicts: # routes/cookbook_routes.py # routes/hwfit_routes.py # services/hwfit/fit.py # services/hwfit/models.py # static/js/cookbook-diagnosis.js # static/js/cookbook-hwfit.js # static/js/cookbook.js # static/js/cookbookRunning.js
This commit is contained in:
@@ -5,7 +5,9 @@ import re
|
||||
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
|
||||
|
||||
QUANT_BPP = {
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"FP4": 0.50, "NVFP4": 0.50, "MXFP4": 0.50, "NF4": 0.50,
|
||||
"INT4": 0.50, "INT8": 1.0, "W4A16": 0.50, "W8A8": 1.0, "W8A16": 1.0,
|
||||
"Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
|
||||
"Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
|
||||
"AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
|
||||
@@ -14,7 +16,9 @@ QUANT_BPP = {
|
||||
}
|
||||
|
||||
QUANT_SPEED_MULT = {
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85, "INT8": 0.85, "NVFP4": 1.1,
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85,
|
||||
"FP4": 1.15, "NVFP4": 1.15, "MXFP4": 1.15, "NF4": 1.10,
|
||||
"INT4": 1.15, "INT8": 0.85, "W4A16": 1.15, "W8A8": 0.85, "W8A16": 0.85,
|
||||
"Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
|
||||
"Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
|
||||
"AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
|
||||
@@ -23,8 +27,10 @@ QUANT_SPEED_MULT = {
|
||||
}
|
||||
|
||||
QUANT_QUALITY_PENALTY = {
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0, "INT8": 0.0, "NVFP4": -0.5,
|
||||
"Q8_0": -0.5, "Q6_K": -1.5, "Q5_K_M": -2.5,
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0,
|
||||
"FP4": -3.0, "NVFP4": -3.0, "MXFP4": -3.0, "NF4": -4.0,
|
||||
"INT4": -4.0, "INT8": 0.0, "W4A16": -4.0, "W8A8": 0.0, "W8A16": 0.0,
|
||||
"Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
|
||||
"Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
|
||||
# Bare "AWQ" and "AWQ-8bit" used to be 0.0 (tied with FP8). In practice
|
||||
# AWQ-anything is a calibrated reconstruction, not raw 8-bit weights —
|
||||
@@ -36,7 +42,9 @@ QUANT_QUALITY_PENALTY = {
|
||||
}
|
||||
|
||||
QUANT_BYTES_PER_PARAM = {
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0, "INT8": 1.0, "NVFP4": 0.5,
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"FP4": 0.5, "NVFP4": 0.5, "MXFP4": 0.5, "NF4": 0.5,
|
||||
"INT4": 0.5, "INT8": 1.0, "W4A16": 0.5, "W8A8": 1.0, "W8A16": 1.0,
|
||||
"Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
|
||||
"Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
|
||||
"AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
|
||||
@@ -44,8 +52,55 @@ QUANT_BYTES_PER_PARAM = {
|
||||
"mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
|
||||
}
|
||||
|
||||
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
|
||||
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "INT8", "NVFP4")
|
||||
# Quantization labels that identify natively pre-quantized checkpoints
# (HF/vLLM-style repos). These are not llama.cpp GGUF quant tiers, so they
# should NOT go through the GGUF quant hierarchy.
PREQUANTIZED_PREFIXES = (
    "AWQ-",
    "GPTQ-",
    "mlx-",
    "FP8",
    "FP4",
    "NVFP4",
    "MXFP4",
    "NF4",
    "INT4",
    "INT8",
    "W4A16",
    "W8A8",
    "W8A16",
)
|
||||
|
||||
|
||||
def infer_quantization_from_name(name):
    """Guess a quantization label from a model/repository name.

    The name is lower-cased and probed in a fixed priority order; the first
    rule that matches wins:

    1. ``nvfp4`` / ``mxfp4`` anywhere in the name.
    2. ``nf4``, ``fp4``, ``w4a16``, ``w8a8``, ``w8a16`` -- only as a whole
       token delimited by ``-``, ``_``, ``/`` or the start/end of the name.
    3. The ``awq``, ``gptq`` and ``mlx`` families, refined by a bit-width
       hint found elsewhere in the name (4-bit is the fallback).
    4. Generic ``fp8``, then 4-bit markers, then 8-bit markers.

    Args:
        name: Model or repository name. ``None``, the empty string and any
            non-string value are treated as "nothing to infer".

    Returns:
        The inferred label (e.g. ``"NVFP4"``, ``"AWQ-4bit"``, ``"INT8"``),
        or ``""`` when no rule matches.
    """
    # A truthy non-string (e.g. a numeric "name" in a catalog row) would
    # raise AttributeError on .lower(); one bad row must not break callers
    # that normalize a whole list of entries.
    if not isinstance(name, str):
        return ""
    n = name.lower()

    # Unambiguous substrings: safe to match anywhere.
    if "nvfp4" in n:
        return "NVFP4"
    if "mxfp4" in n:
        return "MXFP4"

    # Short tags that could occur inside unrelated words, so they must be a
    # standalone token. Order matters and mirrors the label priority.
    for token in ("nf4", "fp4", "w4a16", "w8a8", "w8a16"):
        if re.search(r"(^|[-_/])" + token + r"($|[-_/])", n):
            return token.upper()

    # Shared 8-bit hint used by the family rules and the generic fallback.
    is8 = "8bit" in n or "8-bit" in n or "int8" in n

    if "awq" in n:
        return "AWQ-8bit" if is8 else "AWQ-4bit"
    if "gptq" in n:
        return "GPTQ-Int8" if is8 else "GPTQ-Int4"
    if "mlx" in n:
        # The 6-bit check runs before the 8-bit/4-bit decision.
        if "6bit" in n:
            return "mlx-6bit"
        return "mlx-8bit" if is8 else "mlx-4bit"

    if "fp8" in n:
        return "FP8"
    if "int4" in n or "4bit" in n or "4-bit" in n:
        return "INT4"
    if is8:
        return "INT8"
    return ""
|
||||
|
||||
|
||||
def _normalize_model_entry(model):
    """Fill in a catalog entry's ``quantization`` from its ``name``.

    Non-dict values are returned untouched. For dict entries, a label is
    inferred from ``model["name"]``; when one is found it overwrites
    ``model["quantization"]`` if that field is missing, ``None``, ``""`` or
    ``"Q4_K_M"``, or if the entry has a truthy ``_discovered`` flag.

    The dict is modified in place and the same object is returned.
    """
    if not isinstance(model, dict):
        return model

    guess = infer_quantization_from_name(model.get("name", ""))
    if not guess:
        # Nothing recognisable in the name: keep whatever the entry has.
        return model

    # NOTE(review): "Q4_K_M" is presumably a catalog-wide default rather
    # than a deliberate choice, hence treated like "unset" -- confirm.
    is_unset = model.get("quantization") in (None, "", "Q4_K_M")
    if is_unset or model.get("_discovered"):
        model["quantization"] = guess
    return model
|
||||
|
||||
|
||||
def is_prequantized(model):
|
||||
@@ -72,7 +127,13 @@ def params_b(model):
|
||||
pc = pc.strip().upper()
|
||||
m = re.match(r"^([\d.]+)\s*([BKMGT]?)$", pc)
|
||||
if m:
|
||||
val = float(m.group(1))
|
||||
try:
|
||||
val = float(m.group(1))
|
||||
except ValueError:
|
||||
# Malformed count like "1.5.3B" — [\d.]+ matches but float()
|
||||
# rejects it. One bad catalog row must not abort the whole
|
||||
# ranking pass, so treat it as unknown size.
|
||||
return 0.0
|
||||
suffix = m.group(2)
|
||||
if suffix == "B":
|
||||
return val
|
||||
@@ -180,7 +241,7 @@ def get_models():
|
||||
data_path = os.path.join(os.path.dirname(__file__), "data", "hf_models.json")
|
||||
try:
|
||||
with open(data_path, encoding="utf-8") as f:
|
||||
_models_cache = json.load(f)
|
||||
_models_cache = [_normalize_model_entry(m) for m in json.load(f)]
|
||||
except (FileNotFoundError, json.JSONDecodeError):
|
||||
_models_cache = []
|
||||
return _models_cache
|
||||
|
||||
Reference in New Issue
Block a user