Add Apple Silicon (Metal) GPU detection and unified-memory fit tuning

hardware.py detects Apple Silicon locally and over SSH, reporting
backend=metal, the chip name, and a RAM-scaled fraction of unified
memory as the usable GPU budget. fit.py gains an M1-M4 memory-bandwidth
table for realistic tok/s and drops vLLM-only formats (AWQ/GPTQ/FP8)
that can't be served on Metal.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
yunggilja
2026-05-31 20:24:38 -05:00
parent 051751adcd
commit 32ac81dbc6
2 changed files with 115 additions and 3 deletions
+94 -2
View File
@@ -204,6 +204,82 @@ def _detect_amd():
return None
def _detect_apple_silicon():
    """Detect an Apple Silicon (M-series) GPU and size its usable memory budget.

    Macs have no discrete VRAM — the GPU shares the system's unified memory.
    A fraction of total RAM is reported as the usable GPU budget (intended to
    match macOS's default Metal working-set limit) so the machine is not
    classified as CPU-only. Works locally (platform.system() == "Darwin") and
    over SSH (uname -s reports Darwin).

    Returns:
        A dict with the keys gpu_name, gpu_vram_gb, gpu_count, gpus,
        gpu_groups, homogeneous, backend ("metal") and unified_memory (True),
        or None when the target is not macOS on an ARM CPU or when its total
        memory cannot be read from sysctl.
    """
    # Gate to macOS — locally via platform, remotely via uname.
    if _remote_host:
        if "darwin" not in (_run(["uname", "-s"]) or "").lower():
            return None
        arch = (_run(["uname", "-m"]) or "").lower()
    else:
        if platform.system() != "Darwin":
            return None
        arch = platform.machine().lower()

    # Only Apple Silicon (arm64) has a Metal GPU worth serving LLMs on; Intel
    # Macs fall through to the CPU path.
    if "arm" not in arch and "aarch64" not in arch:
        return None

    # Chip name, e.g. "Apple M4 Max" — carries the Pro/Max/Ultra variant.
    # Strip *before* applying the fallback so whitespace-only output does not
    # produce an empty name (same guard _get_cpu_name uses).
    brand = (_run(["sysctl", "-n", "machdep.cpu.brand_string"]) or "").strip()
    if not brand:
        brand = "Apple Silicon"

    # Total unified memory in bytes.
    memsize = _run(["sysctl", "-n", "hw.memsize"])
    try:
        total_gb = int(memsize) / (1024**3) if memsize else 0.0
    except ValueError:
        total_gb = 0.0
    if total_gb <= 0:
        return None

    # Usable GPU budget. The default working-set limit scales with RAM: small
    # machines have to keep more back for the OS + apps.
    if total_gb <= 16:
        frac = 0.67
    elif total_gb <= 64:
        frac = 0.75
    else:
        frac = 0.80
    vram_gb = round(total_gb * frac, 1)

    # Honour an explicit override if the user raised the limit with
    # `sudo sysctl iogpu.wired_limit_mb=…`. A missing, zero or unparsable
    # value means "no override" and keeps the RAM-scaled default.
    wired = _run(["sysctl", "-n", "iogpu.wired_limit_mb"])
    try:
        wired_mb = int(wired) if wired else 0
    except ValueError:
        wired_mb = 0
    if wired_mb > 0:
        # The budget is carved out of unified memory, so it can never exceed
        # the physical total even if the sysctl was set higher.
        vram_gb = round(min(wired_mb / 1024.0, total_gb), 1)

    gpu = {"index": 0, "name": brand, "vram_gb": vram_gb}
    return {
        "gpu_name": brand,
        "gpu_vram_gb": vram_gb,
        "gpu_count": 1,
        "gpus": [gpu],
        "gpu_groups": _group_gpus([gpu]),
        "homogeneous": True,
        "backend": "metal",
        # Unified memory: the "VRAM" above is carved out of system RAM, not a
        # separate pool — flagged so consumers can avoid double-budgeting.
        "unified_memory": True,
    }
def _read_file(path):
"""Read a file, locally or via SSH."""
if _remote_host:
@@ -246,6 +322,15 @@ def _get_ram_gb():
return (pages * page_size) / (1024**3)
except Exception:
pass
# macOS has no /proc/meminfo — fall back to sysctl (works locally and over
# SSH to a remote Mac, where the sysconf path above isn't taken).
memsize = _run(["sysctl", "-n", "hw.memsize"])
if memsize:
try:
return int(memsize.strip()) / (1024**3)
except ValueError:
pass
return 0.0
@@ -263,6 +348,12 @@ def _get_cpu_name():
if line.startswith("model name"):
return line.split(":", 1)[1].strip()
# macOS has no /proc/cpuinfo — sysctl gives the chip name (e.g. "Apple M4").
# Harmlessly returns nothing on Linux, so it's safe to try unconditionally.
brand = _run(["sysctl", "-n", "machdep.cpu.brand_string"])
if brand and brand.strip():
return brand.strip()
if not _remote_host:
return platform.processor() or "unknown"
return "unknown"
@@ -270,7 +361,8 @@ def _get_cpu_name():
def _get_cpu_count():
if _remote_host:
out = _run(["nproc"])
# nproc on Linux; hw.ncpu via sysctl on a remote Mac (no nproc there).
out = _run(["nproc"]) or _run(["sysctl", "-n", "hw.ncpu"])
if out:
try:
return int(out.strip())
@@ -411,7 +503,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
cpu_cores = _get_cpu_count()
cpu_name = _get_cpu_name()
gpu_info = _detect_nvidia() or _detect_amd()
gpu_info = _detect_apple_silicon() or _detect_nvidia() or _detect_amd()
if gpu_info:
result = {