Merge remote-tracking branch 'origin/main' into visual-pr-playground

# Conflicts:
#	routes/cookbook_routes.py
#	routes/hwfit_routes.py
#	services/hwfit/fit.py
#	services/hwfit/models.py
#	static/js/cookbook-diagnosis.js
#	static/js/cookbook-hwfit.js
#	static/js/cookbook.js
#	static/js/cookbookRunning.js
This commit is contained in:
pewdiepie-archdaemon
2026-06-03 16:49:10 +09:00
569 changed files with 35252 additions and 3489 deletions
+17 -3
View File
@@ -61,7 +61,7 @@ CONTEXT_TARGET = {
def _lookup_bandwidth(gpu_name):
if not gpu_name:
if not isinstance(gpu_name, str) or not gpu_name:
return None
gn = gpu_name.lower()
for key in _BW_KEYS_SORTED:
@@ -280,10 +280,14 @@ def _native_quant(model):
return "FP8"
if "gptq" in text:
m = re.search(r"(?:gptq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
return f"GPTQ-{m.group(1)}bit" if m else "GPTQ"
# Canonical catalog label is "GPTQ-Int4"/"GPTQ-Int8" (see models.py
# QUANT_BPP / QUANT_QUALITY_PENALTY keys); "GPTQ-4bit" misses both
# maps, so BPP and the quality penalty silently fall to defaults.
return f"GPTQ-Int{m.group(1)}" if m else "GPTQ-Int4"
if "awq" in text:
m = re.search(r"(?:awq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
return f"AWQ-{m.group(1)}bit" if m else "AWQ"
# Catalog keys are "AWQ-4bit"/"AWQ-8bit"; bare "AWQ" misses the maps.
return f"AWQ-{m.group(1)}bit" if m else "AWQ-4bit"
if "mlx" in text:
m = re.search(r"mlx[-_]?(\d{1,2})bit", text)
return f"mlx-{m.group(1)}bit" if m else native_quant
@@ -571,6 +575,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
system_backend = (system.get("backend") or "").lower()
apple_silicon = system_backend in ("mps", "metal", "apple")
rocm = system_backend == "rocm"
# Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
# is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
# Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
@@ -589,6 +595,14 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower():
continue
# ROCm support for vLLM/SGLang quantized safetensors is too brittle to
# recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable
# only when the user explicitly picks that format from the quant filter;
# otherwise prefer GGUF/Q* entries that Odysseus can route through
# llama.cpp/Ollama without pretending "fits VRAM" means "servable".
if rocm and is_prequantized(m) and not filter_native:
continue
# On Apple Silicon the only serving engines are llama.cpp and Ollama,
# both GGUF-only (vLLM/SGLang are CUDA/ROCm and don't run on macOS). So
# a model is Metal-servable ONLY if it ships a real GGUF. Drop everything