mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Improve Cookbook serve diagnostics and recommendations
This commit is contained in:
@@ -13919,7 +13919,12 @@
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [],
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/gemma-4-E2B-it-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"vision"
|
||||
]
|
||||
@@ -13942,7 +13947,12 @@
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [],
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/gemma-4-E4B-it-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"vision"
|
||||
]
|
||||
@@ -13965,7 +13975,12 @@
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [],
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/gemma-4-31B-it-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"vision"
|
||||
]
|
||||
@@ -13988,7 +14003,12 @@
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [],
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/gemma-4-26B-A4B-it-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"vision"
|
||||
]
|
||||
@@ -18719,5 +18739,307 @@
|
||||
"hf_likes": 0,
|
||||
"release_date": "2026-04-19",
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.6-27B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "27.8B",
|
||||
"parameters_raw": 27781427952,
|
||||
"min_ram_gb": 16.6,
|
||||
"recommended_ram_gb": 21.6,
|
||||
"min_vram_gb": 16.6,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, coding, MTP",
|
||||
"is_moe": false,
|
||||
"num_experts": null,
|
||||
"active_experts": null,
|
||||
"active_parameters": null,
|
||||
"architecture": "qwen3",
|
||||
"pipeline_tag": "text-generation",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.6-27B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"mtp"
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.6-35B-A3B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "36.0B",
|
||||
"parameters_raw": 35951822704,
|
||||
"min_ram_gb": 21.4,
|
||||
"recommended_ram_gb": 27.8,
|
||||
"min_vram_gb": 21.4,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose (MoE), MTP",
|
||||
"is_moe": true,
|
||||
"num_experts": null,
|
||||
"active_experts": null,
|
||||
"active_parameters": 3000000000,
|
||||
"architecture": "qwen3_moe",
|
||||
"pipeline_tag": "text-generation",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.6-35B-A3B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"mtp"
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.5-0.8B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "873M",
|
||||
"parameters_raw": 873438784,
|
||||
"min_ram_gb": 1.0,
|
||||
"recommended_ram_gb": 2.0,
|
||||
"min_vram_gb": 0.5,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, MTP",
|
||||
"capabilities": [
|
||||
"mtp",
|
||||
"tool_use",
|
||||
"vision"
|
||||
],
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"architecture": "qwen3_5",
|
||||
"hf_downloads": 93448,
|
||||
"hf_likes": 208,
|
||||
"release_date": "2026-02-28",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.5-0.8B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.5-2B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "2.3B",
|
||||
"parameters_raw": 2274069824,
|
||||
"min_ram_gb": 1.3,
|
||||
"recommended_ram_gb": 2.1,
|
||||
"min_vram_gb": 1.2,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, MTP",
|
||||
"capabilities": [
|
||||
"mtp",
|
||||
"tool_use",
|
||||
"vision"
|
||||
],
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"architecture": "qwen3_5",
|
||||
"hf_downloads": 46974,
|
||||
"hf_likes": 115,
|
||||
"release_date": "2026-02-28",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.5-2B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.5-4B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "4.7B",
|
||||
"parameters_raw": 4659865088,
|
||||
"min_ram_gb": 2.6,
|
||||
"recommended_ram_gb": 4.3,
|
||||
"min_vram_gb": 2.4,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, MTP",
|
||||
"capabilities": [
|
||||
"mtp",
|
||||
"tool_use",
|
||||
"vision"
|
||||
],
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"architecture": "qwen3_5",
|
||||
"hf_downloads": 99087,
|
||||
"hf_likes": 202,
|
||||
"release_date": "2026-02-27",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.5-4B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.5-9B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "9.7B",
|
||||
"parameters_raw": 9653104368,
|
||||
"min_ram_gb": 5.4,
|
||||
"recommended_ram_gb": 9.0,
|
||||
"min_vram_gb": 4.9,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, MTP",
|
||||
"capabilities": [
|
||||
"mtp",
|
||||
"tool_use",
|
||||
"vision"
|
||||
],
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"architecture": "qwen3_5",
|
||||
"hf_downloads": 172298,
|
||||
"hf_likes": 345,
|
||||
"release_date": "2026-02-27",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.5-9B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.5-27B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "27.8B",
|
||||
"parameters_raw": 27781427952,
|
||||
"min_ram_gb": 15.5,
|
||||
"recommended_ram_gb": 25.9,
|
||||
"min_vram_gb": 14.2,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, MTP",
|
||||
"capabilities": [
|
||||
"mtp",
|
||||
"tool_use",
|
||||
"vision"
|
||||
],
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"architecture": "qwen3_5",
|
||||
"hf_downloads": 406808,
|
||||
"hf_likes": 565,
|
||||
"release_date": "2026-02-24",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.5-27B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.5-35B-A3B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "36.0B",
|
||||
"parameters_raw": 35951822704,
|
||||
"min_ram_gb": 20.1,
|
||||
"recommended_ram_gb": 33.5,
|
||||
"min_vram_gb": 18.4,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, MTP",
|
||||
"capabilities": [
|
||||
"mtp",
|
||||
"tool_use",
|
||||
"vision"
|
||||
],
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"architecture": "qwen3_5_moe",
|
||||
"hf_downloads": 769032,
|
||||
"hf_likes": 905,
|
||||
"release_date": "2026-02-24",
|
||||
"is_moe": true,
|
||||
"num_experts": 256,
|
||||
"active_experts": 8,
|
||||
"active_parameters": 3000000000,
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.5-35B-A3B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.5-122B-A10B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "125.1B",
|
||||
"parameters_raw": 125086497008,
|
||||
"min_ram_gb": 69.9,
|
||||
"recommended_ram_gb": 116.5,
|
||||
"min_vram_gb": 64.1,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, MTP",
|
||||
"capabilities": [
|
||||
"mtp",
|
||||
"tool_use",
|
||||
"vision"
|
||||
],
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"architecture": "qwen3_5_moe",
|
||||
"hf_downloads": 171055,
|
||||
"hf_likes": 389,
|
||||
"release_date": "2026-02-24",
|
||||
"is_moe": true,
|
||||
"num_experts": 256,
|
||||
"active_experts": 8,
|
||||
"active_parameters": 10000000000,
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.5-122B-A10B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"_discovered": true
|
||||
},
|
||||
{
|
||||
"name": "Qwen/Qwen3.5-397B-A17B-MTP",
|
||||
"provider": "Qwen",
|
||||
"parameter_count": "403.4B",
|
||||
"parameters_raw": 403397928944,
|
||||
"min_ram_gb": 225.4,
|
||||
"recommended_ram_gb": 375.7,
|
||||
"min_vram_gb": 206.6,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, MTP",
|
||||
"capabilities": [
|
||||
"mtp",
|
||||
"tool_use",
|
||||
"vision"
|
||||
],
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"architecture": "qwen3_5_moe",
|
||||
"hf_downloads": 1291825,
|
||||
"hf_likes": 1214,
|
||||
"release_date": "2026-02-16",
|
||||
"is_moe": true,
|
||||
"num_experts": 256,
|
||||
"active_experts": 8,
|
||||
"active_parameters": 17000000000,
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/Qwen3.5-397B-A17B-MTP-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"_discovered": true
|
||||
}
|
||||
]
|
||||
|
||||
+60
-21
@@ -99,6 +99,27 @@ def _estimate_speed(model, quant, run_mode, system):
|
||||
return k / pb * sm
|
||||
|
||||
|
||||
def _architecture_bonus(model):
|
||||
name = (model.get("name") or "").lower()
|
||||
arch = (model.get("architecture") or "").lower()
|
||||
text = f"{name} {arch}"
|
||||
|
||||
# Keep this intentionally small: hardware fit and speed still matter, but
|
||||
# current model families should not be scored the same as older Qwen2/LLama
|
||||
# era entries just because the parameter count is similar.
|
||||
if "qwen3.6" in text or "qwen3_6" in text:
|
||||
return 9
|
||||
if "qwen3.5" in text or "qwen3_5" in text:
|
||||
return 8
|
||||
if "qwen3-next" in text or "qwen3_next" in text:
|
||||
return 6
|
||||
if "qwen3" in text or arch.startswith("qwen3"):
|
||||
return 4
|
||||
if "qwen2.5" in text or "qwen2_5" in text:
|
||||
return 2
|
||||
return 0
|
||||
|
||||
|
||||
def _quality_score(model, quant, use_case):
|
||||
pb = params_b(model)
|
||||
if pb < 1:
|
||||
@@ -128,6 +149,7 @@ def _quality_score(model, quant, use_case):
|
||||
if "gemma" in name_lower:
|
||||
base += 1
|
||||
|
||||
base += _architecture_bonus(model)
|
||||
base += QUANT_QUALITY_PENALTY.get(quant, 0)
|
||||
|
||||
model_uc = infer_use_case(model)
|
||||
@@ -220,12 +242,13 @@ def _quant_bits(q):
|
||||
return 0
|
||||
|
||||
|
||||
def analyze_model(model, system, target_quant=None):
|
||||
def analyze_model(model, system, target_quant=None, scoring_use_case=None):
|
||||
pb = params_b(model)
|
||||
if pb <= 0:
|
||||
return None
|
||||
|
||||
use_case = infer_use_case(model)
|
||||
model_use_case = infer_use_case(model)
|
||||
score_use_case = scoring_use_case or "general"
|
||||
has_gpu = system.get("has_gpu", False)
|
||||
gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
|
||||
gpu_count = system.get("gpu_count", 1) or 1
|
||||
@@ -242,6 +265,8 @@ def analyze_model(model, system, target_quant=None):
|
||||
ctx = model.get("context_length", 4096) or 4096
|
||||
|
||||
native_quant = model.get("quantization", "Q4_K_M")
|
||||
if "nvfp4" in (model.get("name") or "").lower():
|
||||
native_quant = "NVFP4"
|
||||
preq = is_prequantized(model)
|
||||
|
||||
# GGUF models can't be sharded across GPUs — use single GPU VRAM
|
||||
@@ -260,10 +285,13 @@ def analyze_model(model, system, target_quant=None):
|
||||
# Determine which quant to evaluate at
|
||||
if preq:
|
||||
# AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
|
||||
# specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose
|
||||
# native bit-width matches — otherwise selecting Q8 would still surface
|
||||
# AWQ-4bit models, mixing 4- and 8-bit in one view.
|
||||
# GGUF quant tier (Q4/Q8/etc.), do not treat a same-bit AWQ/GPTQ build
|
||||
# as equivalent. "Q4" means llama.cpp/Ollama-style GGUF in this UI;
|
||||
# AWQ/GPTQ/FP8 are separate GPU-serving formats and must only appear
|
||||
# when explicitly selected or when no quant filter is applied.
|
||||
if target_quant:
|
||||
if not any(target_quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4")):
|
||||
return None
|
||||
_tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
|
||||
if _tb and _nb and _tb != _nb:
|
||||
return None
|
||||
@@ -300,7 +328,7 @@ def analyze_model(model, system, target_quant=None):
|
||||
"parameter_count": model.get("parameter_count"),
|
||||
"params_b": round(pb, 1),
|
||||
"is_moe": is_moe,
|
||||
"use_case": use_case,
|
||||
"use_case": model_use_case,
|
||||
"fit_level": "too_tight",
|
||||
"run_mode": "no_fit",
|
||||
"quant": quant_to_try,
|
||||
@@ -334,12 +362,12 @@ def analyze_model(model, system, target_quant=None):
|
||||
|
||||
tps = _estimate_speed(model, quant, run_mode, system)
|
||||
|
||||
q_score = _quality_score(model, quant, use_case)
|
||||
s_score = _speed_score(tps, use_case)
|
||||
q_score = _quality_score(model, quant, score_use_case)
|
||||
s_score = _speed_score(tps, score_use_case)
|
||||
f_score = _fit_score(required_gb, budget)
|
||||
c_score = _context_score(fit_ctx, use_case)
|
||||
c_score = _context_score(fit_ctx, score_use_case)
|
||||
|
||||
wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10))
|
||||
wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10))
|
||||
composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc
|
||||
|
||||
return {
|
||||
@@ -348,7 +376,7 @@ def analyze_model(model, system, target_quant=None):
|
||||
"parameter_count": model.get("parameter_count"),
|
||||
"params_b": round(pb, 1),
|
||||
"is_moe": is_moe,
|
||||
"use_case": use_case,
|
||||
"use_case": model_use_case,
|
||||
"fit_level": fit_level,
|
||||
"run_mode": run_mode,
|
||||
"quant": quant,
|
||||
@@ -419,21 +447,29 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
results.sort(key=sort_fn, reverse=(sort != "vram"))
|
||||
return results[:limit]
|
||||
|
||||
# If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models
|
||||
filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8"))
|
||||
# If user picked a prequantized format (AWQ/FP8/GPTQ/NVFP4), filter to only those models
|
||||
filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4"))
|
||||
|
||||
system_backend = (system.get("backend") or "").lower()
|
||||
apple_silicon = system_backend in ("mps", "metal", "apple")
|
||||
rocm = system_backend == "rocm"
|
||||
|
||||
for m in models:
|
||||
native_q = m.get("quantization", "")
|
||||
if "nvfp4" in (m.get("name") or "").lower():
|
||||
native_q = "NVFP4"
|
||||
|
||||
# MLX-quantized models need the MLX runtime (mlx_lm), which Odysseus
|
||||
# doesn't generate serve commands for — only llama.cpp/Ollama (Metal)
|
||||
# and vLLM/SGLang (CUDA). MLX repos ship no GGUF alternative, so they're
|
||||
# unrunnable on every backend we support. Always drop them, on Apple
|
||||
# Silicon too, so the Cookbook never recommends a model it can't serve.
|
||||
if native_q.startswith("mlx-"):
|
||||
# MLX is Apple Silicon only. Hide MLX rows on non-Mac hardware scans,
|
||||
# but leave them visible on Metal/MPS so Mac support is not broken.
|
||||
if not apple_silicon and (native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()):
|
||||
continue
|
||||
|
||||
# ROCm support for vLLM/SGLang quantized safetensors is too brittle to
|
||||
# recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable
|
||||
# only when the user explicitly picks that format from the quant filter;
|
||||
# otherwise prefer GGUF/Q* entries that Odysseus can route through
|
||||
# llama.cpp/Ollama without pretending "fits VRAM" means "servable".
|
||||
if rocm and is_prequantized(m) and not filter_native:
|
||||
continue
|
||||
|
||||
# On Apple Silicon the only serving engines are llama.cpp and Ollama,
|
||||
@@ -443,7 +479,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
# default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
|
||||
# this the Cookbook recommends models the Mac can't run; on CUDA these
|
||||
# stay visible because vLLM serves safetensors directly.
|
||||
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
|
||||
is_mlx = native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()
|
||||
if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources") or is_mlx):
|
||||
continue
|
||||
|
||||
# Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
|
||||
@@ -454,6 +491,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
continue
|
||||
if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
|
||||
continue
|
||||
if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"):
|
||||
continue
|
||||
|
||||
if search:
|
||||
name = m.get("name", "").lower()
|
||||
@@ -461,7 +500,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
||||
if search.lower() not in name and search.lower() not in provider:
|
||||
continue
|
||||
|
||||
result = analyze_model(m, system, target_quant=quant)
|
||||
result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"))
|
||||
if result is None:
|
||||
continue
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ import re
|
||||
QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
|
||||
|
||||
QUANT_BPP = {
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
|
||||
"Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
|
||||
"Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
|
||||
"AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
|
||||
@@ -14,7 +14,7 @@ QUANT_BPP = {
|
||||
}
|
||||
|
||||
QUANT_SPEED_MULT = {
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85,
|
||||
"F16": 0.6, "BF16": 0.6, "FP8": 0.85, "NVFP4": 1.1,
|
||||
"Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
|
||||
"Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
|
||||
"AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
|
||||
@@ -23,7 +23,7 @@ QUANT_SPEED_MULT = {
|
||||
}
|
||||
|
||||
QUANT_QUALITY_PENALTY = {
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0,
|
||||
"F16": 0.0, "BF16": 0.0, "FP8": 0.0, "NVFP4": 0.0,
|
||||
"Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
|
||||
"Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
|
||||
"AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
|
||||
@@ -32,7 +32,7 @@ QUANT_QUALITY_PENALTY = {
|
||||
}
|
||||
|
||||
QUANT_BYTES_PER_PARAM = {
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0,
|
||||
"F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
|
||||
"Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
|
||||
"Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
|
||||
"AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
|
||||
@@ -41,12 +41,13 @@ QUANT_BYTES_PER_PARAM = {
|
||||
}
|
||||
|
||||
# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "NVFP4")


def is_prequantized(model):
    """Return True when *model* ships in a fixed, pre-quantized format.

    A model counts as pre-quantized when its ``quantization`` value starts
    with one of ``PREQUANTIZED_PREFIXES``, or when its name contains
    ``nvfp4`` (case-insensitive) regardless of the ``quantization`` field.

    Args:
        model: Mapping describing a catalog entry. Missing or null
            ``quantization`` and ``name`` values are treated as empty strings.

    Returns:
        bool: whether the entry is pre-quantized.
    """
    # Guard both fields the same way: a key that is present but null must not
    # crash the ``startswith`` / ``lower`` calls below.
    q = model.get("quantization") or ""
    name = (model.get("name") or "").lower()
    # str.startswith accepts a tuple, so one call covers every prefix.
    return "nvfp4" in name or q.startswith(PREQUANTIZED_PREFIXES)
|
||||
|
||||
|
||||
def params_b(model):
|
||||
|
||||
Reference in New Issue
Block a user