Fix VRAM estimates for pre-quantized HF repos

The Cookbook fit scanner was reporting impossibly low VRAM requirements for some pre-quantized models — e.g. cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit was shown as 7.1 GB ('perfect' on a 12 GB card) when the real load is ~40 GB.

The root cause is in the catalog builder. When _entry_from_modelinfo fell back to safetensors metadata for the parameter count, it stored safetensors.total directly. For pre-quantized repos that figure reflects *packed* element counts: AWQ/GPTQ-Int4 pack 8x 4-bit weights into one I32, and AWQ-8bit/GPTQ-Int8/FP8 pack 4x. The catalog therefore recorded ~1/8 of the real parameter count, and min_vram_gb = packed * bpp double-applied the quantization.

Fix the safetensors fallback:

* prefer the per-dtype parameters dict when available and unpack only the I32/I64 entries (the F16/BF16 scale/zero tensors and embeddings are already at their real element counts)
* fall back to total * pack_factor when only total is exposed

Patch the catalog entries that were affected by the old fallback so that the fit ratings reflect reality without waiting for a full catalog rebuild:

* cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit 11.4B -> 79.7B (40.8 GB VRAM)
* stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ 4.6B -> 30.5B
* stelterlab/NVIDIA-Nemotron-3-Nano-30B-A3B-AWQ 5.1B -> 30.5B
* warshanks/Qwen3-8B-abliterated-AWQ 2.2B -> 8.2B
* QuantTrio/sarvam-30b-AWQ 7B -> 30B
* QuantTrio/sarvam-105b-AWQ 19B -> 105B

Closes #377.
2026-06-17 02:05:22 -04:00 · 2026-06-01 19:32:58 +10:00
parent 16d6484492
commit 9955f5bc95
2 changed files with 68 additions and 39 deletions
@@ -3350,11 +3350,11 @@
 {
  "name": "warshanks/Qwen3-8B-abliterated-AWQ",
  "provider": "warshanks",
-  "parameter_count": "2.2B",
-  "parameters_raw": 2174236152,
-  "min_ram_gb": 1.2,
-  "recommended_ram_gb": 2.0,
-  "min_vram_gb": 1.1,
+  "parameter_count": "8.2B",
+  "parameters_raw": 8190735872,
+  "min_ram_gb": 3.2,
+  "recommended_ram_gb": 6.4,
+  "min_vram_gb": 5.3,
  "quantization": "AWQ-4bit",
  "context_length": 40960,
  "use_case": "General purpose text generation",
@@ -4564,11 +4564,11 @@
 {
  "name": "stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ",
  "provider": "stelterlab",
-  "parameter_count": "4.6B",
-  "parameters_raw": 4605856128,
-  "min_ram_gb": 2.6,
-  "recommended_ram_gb": 4.3,
-  "min_vram_gb": 2.4,
+  "parameter_count": "30.5B",
+  "parameters_raw": 30532122624,
+  "min_ram_gb": 10.9,
+  "recommended_ram_gb": 21.8,
+  "min_vram_gb": 18.2,
  "quantization": "AWQ-4bit",
  "context_length": 262144,
  "use_case": "Code generation and completion",
@@ -4583,7 +4583,7 @@
  "is_moe": true,
  "num_experts": 128,
  "active_experts": 8,
-  "active_parameters": 503765510,
+  "active_parameters": 3300000000,
  "_discovered": true,
  "format": "awq"
 },
@@ -4697,11 +4697,11 @@
 {
  "name": "stelterlab/NVIDIA-Nemotron-3-Nano-30B-A3B-AWQ",
  "provider": "stelterlab",
-  "parameter_count": "5.1B",
-  "parameters_raw": 5053827112,
-  "min_ram_gb": 2.8,
-  "recommended_ram_gb": 4.7,
-  "min_vram_gb": 2.6,
+  "parameter_count": "30.5B",
+  "parameters_raw": 30532122624,
+  "min_ram_gb": 10.9,
+  "recommended_ram_gb": 21.8,
+  "min_vram_gb": 18.2,
  "quantization": "AWQ-4bit",
  "context_length": 262144,
  "use_case": "General purpose text generation",
@@ -4712,7 +4712,11 @@
  "hf_likes": 4,
  "release_date": "2026-01-31",
  "_discovered": true,
-  "format": "awq"
+  "format": "awq",
+  "is_moe": true,
+  "num_experts": 128,
+  "active_experts": 8,
+  "active_parameters": 3300000000
 },
 {
  "name": "lmstudio-community/Qwen3-32B-MLX-4bit",
@@ -12586,11 +12590,11 @@
 {
  "name": "QuantTrio/sarvam-30b-AWQ",
  "provider": "QuantTrio",
-  "parameter_count": "7.0B",
-  "parameters_raw": 7000000000,
-  "min_ram_gb": 4.0,
-  "recommended_ram_gb": 5.2,
-  "min_vram_gb": 4.0,
+  "parameter_count": "30.0B",
+  "parameters_raw": 30000000000,
+  "min_ram_gb": 10.7,
+  "recommended_ram_gb": 21.5,
+  "min_vram_gb": 17.9,
  "quantization": "AWQ-4bit",
  "context_length": 131072,
  "use_case": "Chat, multilingual",
@@ -12605,11 +12609,11 @@
 {
  "name": "QuantTrio/sarvam-105b-AWQ",
  "provider": "QuantTrio",
-  "parameter_count": "19.0B",
-  "parameters_raw": 19000000000,
-  "min_ram_gb": 10.0,
-  "recommended_ram_gb": 13.0,
-  "min_vram_gb": 10.0,
+  "parameter_count": "105.0B",
+  "parameters_raw": 105000000000,
+  "min_ram_gb": 36.8,
+  "recommended_ram_gb": 73.7,
+  "min_vram_gb": 61.4,
  "quantization": "AWQ-4bit",
  "context_length": 131072,
  "use_case": "Chat, multilingual",
@@ -17884,21 +17888,26 @@
 {
  "name": "cyankiwi/Qwen3-Coder-Next-REAM-AWQ-4bit",
  "provider": "cyankiwi",
-  "parameter_count": "11.4B",
-  "parameters_raw": 11412204288,
-  "min_ram_gb": 4.3,
-  "recommended_ram_gb": 8.5,
-  "min_vram_gb": 7.1,
+  "parameter_count": "79.7B",
+  "parameters_raw": 79674391296,
+  "min_ram_gb": 22.3,
+  "recommended_ram_gb": 44.6,
+  "min_vram_gb": 40.8,
  "quantization": "AWQ-4bit",
  "context_length": 32768,
-  "use_case": "General purpose",
+  "use_case": "Coding",
  "capabilities": [],
  "pipeline_tag": "text-generation",
  "architecture": "qwen3_next",
  "hf_downloads": 695,
  "hf_likes": 10,
  "release_date": "2026-02-19",
-  "_discovered": true
+  "is_moe": true,
+  "num_experts": 512,
+  "active_experts": 10,
+  "active_parameters": null,
+  "_discovered": true,
+  "format": "awq"
 },
 {
  "name": "cyankiwi/INTELLECT-3.1-AWQ-8bit",