Fix native Cookbook quant classification

2026-06-17 10:15:27 -04:00 · 2026-06-02 14:07:20 +10:00
parent 65b5d65059
commit cd4f496cb4
6 changed files with 201 additions and 44 deletions
@@ -5,7 +5,9 @@ import re
 QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]

 QUANT_BPP = {
-    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
+    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "FP4": 0.50, "NVFP4": 0.50, "MXFP4": 0.50, "NF4": 0.50,
+    "INT4": 0.50, "INT8": 1.0, "W4A16": 0.50, "W8A8": 1.0, "W8A16": 1.0,
    "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
    "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
    "AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
@@ -14,7 +16,9 @@ QUANT_BPP = {
 }

 QUANT_SPEED_MULT = {
-    "F16": 0.6, "BF16": 0.6, "FP8": 0.85, "NVFP4": 1.1,
+    "F16": 0.6, "BF16": 0.6, "FP8": 0.85,
+    "FP4": 1.15, "NVFP4": 1.15, "MXFP4": 1.15, "NF4": 1.10,
+    "INT4": 1.15, "INT8": 0.85, "W4A16": 1.15, "W8A8": 0.85, "W8A16": 0.85,
    "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
    "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
    "AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
@@ -23,7 +27,9 @@ QUANT_SPEED_MULT = {
 }

 QUANT_QUALITY_PENALTY = {
-    "F16": 0.0, "BF16": 0.0, "FP8": 0.0, "NVFP4": 0.0,
+    "F16": 0.0, "BF16": 0.0, "FP8": 0.0,
+    "FP4": -3.0, "NVFP4": -3.0, "MXFP4": -3.0, "NF4": -4.0,
+    "INT4": -4.0, "INT8": 0.0, "W4A16": -4.0, "W8A8": 0.0, "W8A16": 0.0,
    "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
    "Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
    "AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
@@ -32,7 +38,9 @@ QUANT_QUALITY_PENALTY = {
 }

 QUANT_BYTES_PER_PARAM = {
-    "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
+    "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "FP4": 0.5, "NVFP4": 0.5, "MXFP4": 0.5, "NF4": 0.5,
+    "INT4": 0.5, "INT8": 1.0, "W4A16": 0.5, "W8A8": 1.0, "W8A16": 1.0,
    "Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
    "Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
    "AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
@@ -40,14 +48,60 @@ QUANT_BYTES_PER_PARAM = {
    "mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
 }

-# Pre-quantized formats that should NOT go through the GGUF quant hierarchy
-PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "NVFP4")
+# Pre-quantized formats that should NOT go through the GGUF quant hierarchy.
+# These are native HF/vLLM-style repos, not llama.cpp GGUF quant tiers.
+PREQUANTIZED_PREFIXES = (
+    "AWQ-", "GPTQ-", "mlx-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
+    "INT4", "INT8", "W4A16", "W8A8", "W8A16",
+)
+
+
+def infer_quantization_from_name(name):
+    """Infer a pre-quantized format label from a model/repo name.
+
+    Matching is done on the lowercased name. Returns "" when no known
+    marker is found; a None or empty name also yields "".
+    """
+    n = (name or "").lower()
+    if "nvfp4" in n:
+        return "NVFP4"
+    if "mxfp4" in n:
+        return "MXFP4"
+    # Short tags must stand alone as a "-", "_" or "/" delimited token.
+    for tag in ("nf4", "fp4", "w4a16", "w8a8", "w8a16"):
+        if re.search(r"(^|[-_/])" + tag + r"($|[-_/])", n):
+            return tag.upper()
+    is4 = "int4" in n or "4bit" in n or "4-bit" in n
+    is8 = "8bit" in n or "8-bit" in n or "int8" in n
+    if "awq" in n:  # family markers fall back to 4-bit unless an 8-bit marker is present
+        return "AWQ-8bit" if is8 else "AWQ-4bit"
+    if "gptq" in n:
+        return "GPTQ-Int8" if is8 else "GPTQ-Int4"
+    if "mlx" in n:
+        if "6bit" in n or "6-bit" in n:  # accept the hyphenated form, as for 4-/8-bit
+            return "mlx-6bit"
+        return "mlx-8bit" if is8 else "mlx-4bit"
+    if "fp8" in n:
+        return "FP8"
+    if is4:
+        return "INT4"
+    if is8:
+        return "INT8"
+    return ""
+
+
+def _normalize_model_entry(model):
+    """Set a name-inferred quantization in place when the entry's own is missing, "Q4_K_M" or discovered."""
+    if isinstance(model, dict):
+        guess = infer_quantization_from_name(model.get("name", ""))
+        if guess and (model.get("_discovered") or model.get("quantization") in (None, "", "Q4_K_M")):
+            model["quantization"] = guess
+    return model


 def is_prequantized(model):
     q = model.get("quantization", "")
-    name = (model.get("name") or "").lower()
-    return "nvfp4" in name or any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
+    return (q or "").startswith(PREQUANTIZED_PREFIXES)  # a null "quantization" counts as not pre-quantized


 def params_b(model):
@@ -168,7 +222,7 @@ def get_models():
        data_path = os.path.join(os.path.dirname(__file__), "data", "hf_models.json")
        try:
            with open(data_path, encoding="utf-8") as f:
-                _models_cache = json.load(f)
+                _models_cache = [_normalize_model_entry(m) for m in json.load(f)]
        except (FileNotFoundError, json.JSONDecodeError):
            _models_cache = []
    return _models_cache