mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-23 13:15:29 -04:00
feat(catalog): add Gemma 4 12B/QAT entries and RTX 3050 bandwidth (#4728)
Add official Gemma 4 12B-it plus QAT-INT4/INT8 catalog entries (with their GGUF sources), QAT quantization support across the quant tables and the prequantized-prefix list, and the missing RTX 3050 / 3050 Ti memory bandwidth so speed estimates stop falling back to the generic cuda value.
This commit is contained in:
committed by
GitHub
parent
8f5e36a079
commit
119228a6db
@@ -14059,6 +14059,138 @@
|
||||
"vision"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "google/gemma-4-12B-it",
|
||||
"provider": "Google",
|
||||
"parameter_count": "12.0B",
|
||||
"parameters_raw": 12000000000,
|
||||
"min_ram_gb": 8.5,
|
||||
"recommended_ram_gb": 11.0,
|
||||
"min_vram_gb": 7.5,
|
||||
"quantization": "Q4_K_M",
|
||||
"context_length": 131072,
|
||||
"use_case": "General purpose, multimodal; unsloth/gemma-4-12B-it-GGUF Dynamic variants reduce VRAM from ~7.5 GB to ~5.5 GB",
|
||||
"is_moe": false,
|
||||
"num_experts": null,
|
||||
"active_experts": null,
|
||||
"active_parameters": null,
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "unsloth/gemma-4-12B-it-GGUF",
|
||||
"provider": "unsloth"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"vision"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "google/gemma-4-12B-it-qat-int4",
|
||||
"provider": "Google",
|
||||
"parameter_count": "12.0B",
|
||||
"parameters_raw": 12000000000,
|
||||
"min_ram_gb": 8.0,
|
||||
"recommended_ram_gb": 9.5,
|
||||
"min_vram_gb": 6.5,
|
||||
"quantization": "QAT-INT4",
|
||||
"context_length": 131072,
|
||||
"use_case": "General purpose, multimodal (QAT quantization-aware training — higher quality than post-train INT4; vLLM native; no GGUF)",
|
||||
"is_moe": false,
|
||||
"num_experts": null,
|
||||
"active_experts": null,
|
||||
"active_parameters": null,
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [],
|
||||
"capabilities": [
|
||||
"vision"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "google/gemma-4-12B-it-qat-int8",
|
||||
"provider": "Google",
|
||||
"parameter_count": "12.0B",
|
||||
"parameters_raw": 12000000000,
|
||||
"min_ram_gb": 15.0,
|
||||
"recommended_ram_gb": 20.0,
|
||||
"min_vram_gb": 13.5,
|
||||
"quantization": "QAT-INT8",
|
||||
"context_length": 131072,
|
||||
"use_case": "General purpose, multimodal (QAT INT8 — highest quality, 2x VRAM of QAT-INT4; vLLM native; no GGUF)",
|
||||
"is_moe": false,
|
||||
"num_experts": null,
|
||||
"active_experts": null,
|
||||
"active_parameters": null,
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [],
|
||||
"capabilities": [
|
||||
"vision"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "google/gemma-4-12B-it-qat-q4_0-gguf",
|
||||
"provider": "Google",
|
||||
"parameter_count": "12.0B",
|
||||
"parameters_raw": 12000000000,
|
||||
"min_ram_gb": 8.5,
|
||||
"recommended_ram_gb": 11.0,
|
||||
"min_vram_gb": 7.5,
|
||||
"quantization": "QAT-INT4",
|
||||
"context_length": 262144,
|
||||
"use_case": "General purpose, multimodal (vision + audio); official Google QAT int4 GGUF — near-bf16 quality at int4 size, served on llama.cpp/Ollama with CPU offload",
|
||||
"is_moe": false,
|
||||
"num_experts": null,
|
||||
"active_experts": null,
|
||||
"active_parameters": null,
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "google/gemma-4-12B-it-qat-q4_0-gguf",
|
||||
"provider": "Google",
|
||||
"file": "gemma-4-12b-it-qat-q4_0.gguf"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"vision",
|
||||
"audio"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "google/gemma-4-26B-A4B-it-qat-q4_0-gguf",
|
||||
"provider": "Google",
|
||||
"parameter_count": "25.2B",
|
||||
"parameters_raw": 25200000000,
|
||||
"min_ram_gb": 14.4,
|
||||
"recommended_ram_gb": 18.0,
|
||||
"min_vram_gb": 14.4,
|
||||
"quantization": "QAT-INT4",
|
||||
"context_length": 262144,
|
||||
"use_case": "High-throughput, multimodal MoE (3.8B active); official Google QAT int4 GGUF — near-bf16 quality at int4 size, served on llama.cpp with CPU offload",
|
||||
"is_moe": true,
|
||||
"num_experts": null,
|
||||
"active_experts": null,
|
||||
"active_parameters": 3800000000,
|
||||
"architecture": "gemma4",
|
||||
"pipeline_tag": "image-text-to-text",
|
||||
"release_date": "2026-04-01",
|
||||
"gguf_sources": [
|
||||
{
|
||||
"repo": "google/gemma-4-26B-A4B-it-qat-q4_0-gguf",
|
||||
"provider": "Google"
|
||||
}
|
||||
],
|
||||
"capabilities": [
|
||||
"vision"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "google/gemma-4-31B-it",
|
||||
"provider": "Google",
|
||||
@@ -19144,4 +19276,4 @@
|
||||
],
|
||||
"_discovered": true
|
||||
}
|
||||
]
|
||||
]
|
||||
@@ -9,7 +9,7 @@ from services.hwfit.models import (
|
||||
GPU_BANDWIDTH = {
|
||||
"5090": 1792, "5080": 960, "5070 ti": 896, "5070": 672, "5060 ti": 448, "5060": 256,
|
||||
"4090": 1008, "4080 super": 736, "4080": 717, "4070 ti super": 672, "4070 ti": 504, "4070 super": 504, "4070": 504, "4060 ti": 288, "4060": 272,
|
||||
"3090 ti": 1008, "3090": 936, "3080 ti": 912, "3080": 760, "3070 ti": 608, "3070": 448, "3060 ti": 448, "3060": 360,
|
||||
"3090 ti": 1008, "3090": 936, "3080 ti": 912, "3080": 760, "3070 ti": 608, "3070": 448, "3060 ti": 448, "3060": 360, "3050 ti": 192, "3050": 224,
|
||||
"2080 ti": 616, "2080 super": 496, "2080": 448, "2070 super": 448, "2070": 448, "2060 super": 448, "2060": 336,
|
||||
"1660 ti": 288, "1660 super": 336, "1660": 192, "1650 super": 192, "1650": 128,
|
||||
"h100 sxm": 3350, "h100": 2039, "h200": 4800, "a100 sxm": 2039, "a100": 1555,
|
||||
|
||||
@@ -12,6 +12,7 @@ QUANT_BPP = {
|
||||
"Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
|
||||
"AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
|
||||
"GPTQ-Int4": 0.50, "GPTQ-Int8": 1.0,
|
||||
"QAT-INT4": 0.50, "QAT-INT8": 1.0,
|
||||
"mlx-4bit": 0.55, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
|
||||
# DeepSeek-V4-style mixed: MoE experts in FP4 (bulk), attention + non-
|
||||
# expert dense in FP8, embeddings/LM head in BF16. By weight count the
|
||||
@@ -30,6 +31,7 @@ QUANT_SPEED_MULT = {
|
||||
"Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
|
||||
"AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
|
||||
"GPTQ-Int4": 1.2, "GPTQ-Int8": 0.85,
|
||||
"QAT-INT4": 1.15, "QAT-INT8": 0.85,
|
||||
"mlx-4bit": 1.15, "mlx-8bit": 0.85, "mlx-6bit": 1.0,
|
||||
"FP4-MoE-Mixed": 1.10, # slightly slower than pure FP4 because of mixed-dtype dispatch
|
||||
"FP8-Mixed": 0.85,
|
||||
@@ -47,6 +49,10 @@ QUANT_QUALITY_PENALTY = {
|
||||
# penalty so FP8 wins when both fit. AWQ-4bit stays heavier.
|
||||
"AWQ": -1.0, "AWQ-4bit": -4.0, "AWQ-8bit": -1.0,
|
||||
"GPTQ": -1.0, "GPTQ-Int4": -4.0, "GPTQ-Int8": -1.0,
|
||||
# Quantization-aware training recovers most of the int4 quality loss, so a
|
||||
# QAT-INT4 build lands far closer to bf16 than a post-training Q4/INT4
|
||||
# (Google reports near-bf16 quality). Penalize it lightly, not like Q4_K_M.
|
||||
"QAT-INT4": -1.0, "QAT-INT8": 0.0,
|
||||
"mlx-4bit": -4.0, "mlx-8bit": -0.5, "mlx-6bit": -1.5,
|
||||
# DeepSeek-V4 mixed: only MoE experts at FP4 (the rest is FP8/BF16),
|
||||
# so the realized quality is much closer to FP8 than to pure FP4 —
|
||||
@@ -63,6 +69,7 @@ QUANT_BYTES_PER_PARAM = {
|
||||
"Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
|
||||
"AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
|
||||
"GPTQ-Int4": 0.5, "GPTQ-Int8": 1.0,
|
||||
"QAT-INT4": 0.5, "QAT-INT8": 1.0,
|
||||
"mlx-4bit": 0.5, "mlx-8bit": 1.0, "mlx-6bit": 0.75,
|
||||
"FP4-MoE-Mixed": 0.55,
|
||||
"FP8-Mixed": 1.0,
|
||||
@@ -74,6 +81,7 @@ PREQUANTIZED_PREFIXES = (
|
||||
"AWQ-", "GPTQ-", "mlx-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
|
||||
"INT4", "INT8", "W4A16", "W8A8", "W8A16",
|
||||
"FP4-MoE-Mixed", "FP8-Mixed",
|
||||
"QAT-",
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
from services.hwfit.fit import rank_models
|
||||
from services.hwfit.models import get_models, is_prequantized
|
||||
|
||||
|
||||
def _8gb_vram_system():
    """Return a fixed system description for a single-GPU CUDA machine.

    The dict describes one NVIDIA GeForce RTX 4060 with 8 GB of VRAM and
    32 GB of total and available RAM. A fresh dict is built on every call,
    so callers may mutate the result without affecting other tests.
    """
    return {
        "has_gpu": True,
        "backend": "cuda",
        "gpu_name": "NVIDIA GeForce RTX 4060",
        "gpu_vram_gb": 8.0,
        "gpu_count": 1,
        "available_ram_gb": 32.0,
        "total_ram_gb": 32.0,
    }
|
||||
|
||||
|
||||
def test_gemma4_12b_in_catalog():
    """The base Gemma 4 12B-it entry must be present in the model catalog."""
    # Index the catalog by model name so the lookup is a plain membership test.
    catalog = {m["name"]: m for m in get_models()}
    assert "google/gemma-4-12B-it" in catalog, "gemma-4-12B-it missing from catalog"
|
||||
|
||||
|
||||
def test_gemma4_12b_has_gguf_source():
    """The base Gemma 4 12B-it entry must list the unsloth GGUF repository."""
    catalog = {m["name"]: m for m in get_models()}
    entry = catalog["google/gemma-4-12B-it"]
    # A missing key and an empty list both count as "no GGUF sources".
    assert entry.get("gguf_sources"), "gemma-4-12B-it has no gguf_sources"
    repos = [s["repo"] for s in entry["gguf_sources"]]
    assert "unsloth/gemma-4-12B-it-GGUF" in repos
|
||||
|
||||
|
||||
def test_gemma4_12b_rank_models_returns_it_for_8gb_vram():
    """Ranking for the 8 GB VRAM system must include the base 12B-it entry."""
    # The search string is the base model name; the result list is capped at 20.
    results = rank_models(_8gb_vram_system(), search="gemma-4-12B-it", limit=20)
    names = [r["name"] for r in results]
    assert "google/gemma-4-12B-it" in names, "rank_models did not return gemma-4-12B-it for 8 GB VRAM"
|
||||
|
||||
|
||||
def test_gemma4_12b_qat_entries_in_catalog():
    """Both QAT variants (int4 and int8) of Gemma 4 12B-it must be cataloged."""
    catalog = {m["name"]: m for m in get_models()}
    assert "google/gemma-4-12B-it-qat-int4" in catalog
    assert "google/gemma-4-12B-it-qat-int8" in catalog
|
||||
|
||||
|
||||
def test_gemma4_12b_qat_entries_are_prequantized():
    """``is_prequantized`` must report both QAT catalog entries as prequantized."""
    catalog = {m["name"]: m for m in get_models()}
    assert is_prequantized(catalog["google/gemma-4-12B-it-qat-int4"])
    assert is_prequantized(catalog["google/gemma-4-12B-it-qat-int8"])
|
||||
|
||||
|
||||
def test_gemma4_12b_qat_entries_have_no_gguf():
    """The int4/int8 QAT entries must carry an explicitly empty GGUF source list."""
    catalog = {m["name"]: m for m in get_models()}
    # Compared against [] on purpose: the key must exist and be an empty list.
    assert catalog["google/gemma-4-12B-it-qat-int4"]["gguf_sources"] == []
    assert catalog["google/gemma-4-12B-it-qat-int8"]["gguf_sources"] == []
|
||||
Reference in New Issue
Block a user