Fix native Cookbook quant classification

2026-06-16 17:55:26 -04:00 · 2026-06-02 14:07:20 +10:00
parent 65b5d65059
commit cd4f496cb4
6 changed files with 201 additions and 44 deletions
@@ -0,0 +1,78 @@
+from services.hwfit.fit import analyze_model, rank_models
+from services.hwfit.models import (
+    get_models,
+    infer_quantization_from_name,
+    is_prequantized,
+)
+
+
+def _dual_5060ti_system():
+    """Return a fresh system description for the simulated test host.
+
+    The dict describes a CUDA machine whose GPU name is an RTX 5060 Ti
+    with ``gpu_count`` 2, and reports 128.0 for both RAM fields. A new
+    dict is built on every call, so callers may modify their copy freely.
+    """
+    return dict(
+        has_gpu=True,
+        backend="cuda",
+        gpu_name="NVIDIA GeForce RTX 5060 Ti",
+        gpu_vram_gb=31.0,
+        gpu_count=2,
+        available_ram_gb=128.0,
+        total_ram_gb=128.0,
+    )
+
+
+def test_infers_native_hf_quant_formats_from_repo_names():
+    """Repo names with a native quant suffix resolve to the expected label."""
+    expected = {
+        "txn545/Qwen3.5-122B-A10B-NVFP4": "NVFP4",
+        "some/model-MXFP4": "MXFP4",
+        "some/model-NF4": "NF4",
+        "some/model-FP4": "FP4",
+        "some/model-W4A16": "W4A16",
+        "some/model-W8A8": "W8A8",
+        "some/model-W8A16": "W8A16",
+        "some/model-INT4": "INT4",
+        "some/model-8bit": "INT8",
+    }
+
+    # Infer every name first, then compare once, so a failure shows the
+    # complete mapping rather than only the first mismatch.
+    inferred = {}
+    for repo_name in expected:
+        inferred[repo_name] = infer_quantization_from_name(repo_name)
+
+    assert inferred == expected
+
+
+def test_nvfp4_catalog_quant_is_preserved():
+    """The NVFP4 catalog entry keeps its quant label and is prequantized."""
+    repo_name = "txn545/Qwen3.5-122B-A10B-NVFP4"
+
+    # Index the catalog by model name; a later duplicate overrides an
+    # earlier one.
+    by_name = {}
+    for entry in get_models():
+        by_name[entry["name"]] = entry
+    nvfp4_model = by_name[repo_name]
+
+    assert nvfp4_model["quantization"] == "NVFP4"
+    assert is_prequantized(nvfp4_model)
+
+
+def test_nvfp4_search_result_is_not_gguf_or_cpu_offload():
+    """Check the NVFP4 catalog entry via both analyze_model and rank_models.
+
+    Both paths must report quant "NVFP4" and a run mode other than
+    "cpu_offload" for the system returned by ``_dual_5060ti_system``.
+    """
+    repo_name = "txn545/Qwen3.5-122B-A10B-NVFP4"
+    catalog = {m["name"]: m for m in get_models()}
+    model = catalog[repo_name]
+
+    fit = analyze_model(model, _dual_5060ti_system())
+    assert fit["quant"] == "NVFP4"
+    assert fit["run_mode"] != "cpu_offload"
+
+    # Build a fresh system dict for the second call so it cannot observe
+    # any change the first call might have made to its argument.
+    results = rank_models(
+        _dual_5060ti_system(),
+        search="Qwen3.5-122B-A10B-NVFP4",
+        limit=10,
+    )
+    # A default of None turns a missing hit into a readable assertion
+    # failure instead of a bare StopIteration.
+    hit = next((r for r in results if r["name"] == repo_name), None)
+    assert hit is not None, f"{repo_name} missing from rank_models results"
+    assert hit["quant"] == "NVFP4"
+    assert hit["run_mode"] != "cpu_offload"
+
+
+def test_selected_gguf_quant_is_strict_not_lower_quant_fallback():
+    """An explicit Q8_0 target is reported as Q8_0 with run mode "no_fit".
+
+    The model dict itself is labelled Q4_K_M; the result must still carry
+    the requested target quant instead of a different one.
+    """
+    # Start from the shared system and lower both RAM fields to 80.0.
+    host = _dual_5060ti_system()
+    host.update(available_ram_gb=80.0, total_ram_gb=80.0)
+
+    gguf_model = dict(
+        name="local/Huge-GGUF",
+        provider="local",
+        parameter_count="100B",
+        parameters_raw=100_000_000_000,
+        quantization="Q4_K_M",
+        context_length=4096,
+    )
+
+    result = analyze_model(gguf_model, host, target_quant="Q8_0")
+
+    assert result["quant"] == "Q8_0"
+    assert result["run_mode"] == "no_fit"