mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Add Apple Silicon (Metal) GPU detection and unified-memory fit tuning
hardware.py detects Apple Silicon locally and over SSH, reporting backend=metal, the chip name, and a RAM-scaled fraction of unified memory as the usable GPU budget. fit.py gains an M1-M4 memory-bandwidth table for realistic tok/s and drops vLLM-only formats (AWQ/GPTQ/FP8) that can't be served on Metal. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+21
-1
@@ -19,12 +19,22 @@ GPU_BANDWIDTH = {
|
|||||||
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
|
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
|
||||||
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
|
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
|
||||||
"9070 xt": 624, "9070": 488,
|
"9070 xt": 624, "9070": 488,
|
||||||
|
# Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
|
||||||
|
# reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). The
|
||||||
|
# order of entries here is not what makes matching correct; the length-sort
|
||||||
|
# of the keys (done below) is, since it guarantees "m4 max" is tried before "m4".
|
||||||
|
"m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68,
|
||||||
|
"m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100,
|
||||||
|
"m3 ultra": 800, "m3 max": 300, "m3 pro": 150, "m3": 100,
|
||||||
|
"m4 max": 410, "m4 pro": 273, "m4": 120,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Pre-sort keys by length descending for correct substring matching
|
# Pre-sort keys by length descending for correct substring matching
|
||||||
_BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True)
|
_BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True)
|
||||||
|
|
||||||
FALLBACK_K = {"cuda": 220, "rocm": 180, "cpu_x86": 70, "cpu_arm": 90}
|
# metal: backstop for Apple Silicon chips not in GPU_BANDWIDTH (e.g. a future
|
||||||
|
# M5) — the named chips above take the accurate bandwidth path instead.
|
||||||
|
FALLBACK_K = {"cuda": 220, "rocm": 180, "metal": 150, "cpu_x86": 70, "cpu_arm": 90}
|
||||||
|
|
||||||
USE_CASE_WEIGHTS = {
|
USE_CASE_WEIGHTS = {
|
||||||
"general": (0.45, 0.30, 0.15, 0.10),
|
"general": (0.45, 0.30, 0.15, 0.10),
|
||||||
@@ -424,6 +434,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
|
|||||||
if not apple_silicon and native_q.startswith("mlx-"):
|
if not apple_silicon and native_q.startswith("mlx-"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# The mirror case: vLLM-only prequant formats (AWQ / GPTQ / FP8 / NVFP4 /
|
||||||
|
# compressed-tensors) can't be served by llama.cpp or Ollama, the only
|
||||||
|
# Metal-capable engines — vLLM itself doesn't run on macOS at all. Drop
|
||||||
|
# them on Apple Silicon UNLESS the model also ships a GGUF build we can
|
||||||
|
# actually serve. Without this, the Cookbook recommends models the Mac
|
||||||
|
# can never run.
|
||||||
|
if apple_silicon and not m.get("gguf_sources"):
|
||||||
|
if native_q.upper().startswith(("AWQ", "GPTQ", "FP8", "NVFP4", "W4A16", "W8A8")):
|
||||||
|
continue
|
||||||
|
|
||||||
# Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
|
# Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
|
||||||
if filter_native:
|
if filter_native:
|
||||||
if quant == "FP8" and native_q != "FP8":
|
if quant == "FP8" and native_q != "FP8":
|
||||||
|
|||||||
@@ -204,6 +204,82 @@ def _detect_amd():
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_apple_silicon():
    """Detect an Apple Silicon (M-series) GPU and size its usable memory budget.

    Macs have no discrete VRAM — the GPU shares the system's unified memory.
    A fraction of total RAM is reported as the usable GPU budget so the
    machine is not classified as CPU-only. Detection works locally (via
    ``platform``) and against a remote host (via ``uname`` run through
    ``_run`` when ``_remote_host`` is set).

    Returns:
        dict | None: ``None`` when the target is not macOS, is not an
        arm64/aarch64 machine, or its total memory cannot be read. Otherwise
        a GPU-info dict with the keys ``gpu_name``, ``gpu_vram_gb``,
        ``gpu_count``, ``gpus``, ``gpu_groups``, ``homogeneous``,
        ``backend`` (always ``"metal"``) and ``unified_memory`` (always
        ``True``).
    """
    # Gate to macOS — locally via platform, remotely via uname.
    if _remote_host:
        if "darwin" not in (_run(["uname", "-s"]) or "").lower():
            return None
        arch = (_run(["uname", "-m"]) or "").lower()
    else:
        if platform.system() != "Darwin":
            return None
        arch = platform.machine().lower()

    # Only Apple Silicon (arm64) is treated as a Metal GPU target; Intel Macs
    # return None here so the caller can fall through to other detection.
    if "arm" not in arch and "aarch64" not in arch:
        return None

    # Chip name, e.g. "Apple M4 Max" — carries the Pro/Max/Ultra variant.
    # Strip BEFORE applying the fallback so a whitespace-only sysctl result
    # cannot produce an empty GPU name.
    brand = (
        (_run(["sysctl", "-n", "machdep.cpu.brand_string"]) or "").strip()
        or "Apple Silicon"
    )

    # Total unified memory in bytes; bail out if it is missing or unparsable.
    memsize = _run(["sysctl", "-n", "hw.memsize"])
    try:
        total_gb = int(memsize) / (1024**3) if memsize else 0.0
    except ValueError:
        total_gb = 0.0
    if total_gb <= 0:
        return None

    # Usable GPU budget: the fraction of unified memory handed to the GPU
    # scales with RAM, because small machines must keep more back for the OS
    # and apps.
    if total_gb <= 16:
        frac = 0.67
    elif total_gb <= 64:
        frac = 0.75
    else:
        frac = 0.80
    vram_gb = round(total_gb * frac, 1)

    # Honour an explicit override if the user raised the limit with
    # `sudo sysctl iogpu.wired_limit_mb=…`. A non-positive, missing or
    # unparsable value means "no override".
    wired = _run(["sysctl", "-n", "iogpu.wired_limit_mb"])
    try:
        wired_mb = int(wired) if wired else 0
    except ValueError:
        wired_mb = 0
    if wired_mb > 0:
        # Clamp to physical memory: the budget can never exceed total RAM,
        # so a bogus override must not inflate what we report.
        vram_gb = round(min(wired_mb / 1024.0, total_gb), 1)

    gpu = {"index": 0, "name": brand, "vram_gb": vram_gb}
    return {
        "gpu_name": brand,
        "gpu_vram_gb": vram_gb,
        "gpu_count": 1,
        "gpus": [gpu],
        "gpu_groups": _group_gpus([gpu]),
        "homogeneous": True,
        "backend": "metal",
        # Unified memory: the "VRAM" above is carved out of system RAM, not a
        # separate pool — flagged so consumers can avoid double-budgeting.
        "unified_memory": True,
    }
|
||||||
|
|
||||||
|
|
||||||
def _read_file(path):
|
def _read_file(path):
|
||||||
"""Read a file, locally or via SSH."""
|
"""Read a file, locally or via SSH."""
|
||||||
if _remote_host:
|
if _remote_host:
|
||||||
@@ -246,6 +322,15 @@ def _get_ram_gb():
|
|||||||
return (pages * page_size) / (1024**3)
|
return (pages * page_size) / (1024**3)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# macOS has no /proc/meminfo — fall back to sysctl (works locally and over
|
||||||
|
# SSH to a remote Mac, where the sysconf path above isn't taken).
|
||||||
|
memsize = _run(["sysctl", "-n", "hw.memsize"])
|
||||||
|
if memsize:
|
||||||
|
try:
|
||||||
|
return int(memsize.strip()) / (1024**3)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
@@ -263,6 +348,12 @@ def _get_cpu_name():
|
|||||||
if line.startswith("model name"):
|
if line.startswith("model name"):
|
||||||
return line.split(":", 1)[1].strip()
|
return line.split(":", 1)[1].strip()
|
||||||
|
|
||||||
|
# macOS has no /proc/cpuinfo — sysctl gives the chip name (e.g. "Apple M4").
|
||||||
|
# Harmlessly returns nothing on Linux, so it's safe to try unconditionally.
|
||||||
|
brand = _run(["sysctl", "-n", "machdep.cpu.brand_string"])
|
||||||
|
if brand and brand.strip():
|
||||||
|
return brand.strip()
|
||||||
|
|
||||||
if not _remote_host:
|
if not _remote_host:
|
||||||
return platform.processor() or "unknown"
|
return platform.processor() or "unknown"
|
||||||
return "unknown"
|
return "unknown"
|
||||||
@@ -270,7 +361,8 @@ def _get_cpu_name():
|
|||||||
|
|
||||||
def _get_cpu_count():
|
def _get_cpu_count():
|
||||||
if _remote_host:
|
if _remote_host:
|
||||||
out = _run(["nproc"])
|
# nproc on Linux; hw.ncpu via sysctl on a remote Mac (no nproc there).
|
||||||
|
out = _run(["nproc"]) or _run(["sysctl", "-n", "hw.ncpu"])
|
||||||
if out:
|
if out:
|
||||||
try:
|
try:
|
||||||
return int(out.strip())
|
return int(out.strip())
|
||||||
@@ -411,7 +503,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
|
|||||||
cpu_cores = _get_cpu_count()
|
cpu_cores = _get_cpu_count()
|
||||||
cpu_name = _get_cpu_name()
|
cpu_name = _get_cpu_name()
|
||||||
|
|
||||||
gpu_info = _detect_nvidia() or _detect_amd()
|
gpu_info = _detect_apple_silicon() or _detect_nvidia() or _detect_amd()
|
||||||
|
|
||||||
if gpu_info:
|
if gpu_info:
|
||||||
result = {
|
result = {
|
||||||
|
|||||||
Reference in New Issue
Block a user