From 32ac81dbc680361463a088dae867d555d5a79c3b Mon Sep 17 00:00:00 2001 From: yunggilja Date: Sun, 31 May 2026 20:24:38 -0500 Subject: [PATCH] Add Apple Silicon (Metal) GPU detection and unified-memory fit tuning hardware.py detects Apple Silicon locally and over SSH, reporting backend=metal, the chip name, and a RAM-scaled fraction of unified memory as the usable GPU budget. fit.py gains an M1-M4 memory-bandwidth table for realistic tok/s and drops vLLM-only formats (AWQ/GPTQ/FP8) that can't be served on Metal. Co-Authored-By: Claude Opus 4.8 --- services/hwfit/fit.py | 22 ++++++++- services/hwfit/hardware.py | 96 +++++++++++++++++++++++++++++++++++++- 2 files changed, 115 insertions(+), 3 deletions(-) diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py index 0cd142c53..901ff7366 100644 --- a/services/hwfit/fit.py +++ b/services/hwfit/fit.py @@ -19,12 +19,22 @@ GPU_BANDWIDTH = { "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224, "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229, "9070 xt": 624, "9070": 488, + # Apple Silicon unified-memory bandwidth (GB/s), keyed off the chip name from + # sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Length-sorting (done + # below) guarantees "m4 max" is tried before "m4". NOTE(review): bare "m4" also + # substring-matches names like "Tesla M40" — confirm lookup is backend-gated. + "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68, + "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100, + "m3 ultra": 800, "m3 max": 300, "m3 pro": 150, "m3": 100, + "m4 max": 410, "m4 pro": 273, "m4": 120, } # Pre-sort keys by length descending for correct substring matching _BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True) -FALLBACK_K = {"cuda": 220, "rocm": 180, "cpu_x86": 70, "cpu_arm": 90} +# metal: backstop for Apple Silicon chips not in GPU_BANDWIDTH (e.g. a future +# M5) — the named chips above take the accurate bandwidth path instead. 
+FALLBACK_K = {"cuda": 220, "rocm": 180, "metal": 150, "cpu_x86": 70, "cpu_arm": 90} USE_CASE_WEIGHTS = { "general": (0.45, 0.30, 0.15, 0.10), @@ -424,6 +434,16 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan if not apple_silicon and native_q.startswith("mlx-"): continue + # The mirror case: vLLM-only prequant formats (AWQ / GPTQ / FP8 / NVFP4 / + # compressed-tensors) can't be served by llama.cpp or Ollama, the GGUF + # engines on Metal (MLX uses its own mlx-* formats) — vLLM itself doesn't + # run on macOS at all. Drop them on Apple Silicon UNLESS the model also + # ships a GGUF build we can actually serve. Without this, the Cookbook + # recommends models the Mac can never run. + if apple_silicon and not m.get("gguf_sources"): + if native_q.upper().startswith(("AWQ", "GPTQ", "FP8", "NVFP4", "W4A16", "W8A8")): + continue + # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models if filter_native: if quant == "FP8" and native_q != "FP8": diff --git a/services/hwfit/hardware.py b/services/hwfit/hardware.py index 86aa77757..6afac104f 100644 --- a/services/hwfit/hardware.py +++ b/services/hwfit/hardware.py @@ -204,6 +204,82 @@ def _detect_amd(): return None +def _detect_apple_silicon(): + """Detect Apple Silicon (M-series) GPUs. + + Macs have no discrete VRAM — the GPU shares the system's unified memory. + We report a fraction of total RAM as the usable GPU budget (approximating macOS's + default Metal working-set limit) so the Cookbook recommends models that + actually run on the GPU instead of classifying the machine as CPU-only. + + backend="metal" is what services.hwfit.fit and the serve-command generation + key off of (they already understand MLX / llama.cpp-Metal). Works locally + (platform.system()=="Darwin") and over SSH (uname -s == Darwin). + """ + # Gate to macOS — locally via platform, remotely via uname. 
+ if _remote_host: + if "darwin" not in (_run(["uname", "-s"]) or "").lower(): + return None + arch = (_run(["uname", "-m"]) or "").lower() + else: + if platform.system() != "Darwin": + return None + arch = platform.machine().lower() + + # Only Apple Silicon (arm64) has a Metal GPU worth serving LLMs on; Intel Macs + # fall through to the CPU path. NOTE(review): so does x86_64 Python on Rosetta. + if "arm" not in arch and "aarch64" not in arch: + return None + + # Chip name, e.g. "Apple M4 Max" — carries the Pro/Max/Ultra variant that + # the fit bandwidth table keys off of. + brand = (_run(["sysctl", "-n", "machdep.cpu.brand_string"]) or "Apple Silicon").strip() + + # Total unified memory in bytes. + memsize = _run(["sysctl", "-n", "hw.memsize"]) + try: + total_gb = int(memsize) / (1024**3) if memsize else 0.0 + except ValueError: + total_gb = 0.0 + if total_gb <= 0: + return None + + # Usable GPU budget. macOS lets Metal use most of unified memory, but the + # default working-set limit scales with RAM: small machines keep more back + # for the OS + apps. These fractions approximate Apple's + # recommendedMaxWorkingSetSize defaults (NOTE(review): verify per RAM + # tier). Honour an explicit override if the user raised it with + # `sudo sysctl iogpu.wired_limit_mb=…`. + if total_gb <= 16: + frac = 0.67 + elif total_gb <= 64: + frac = 0.75 + else: + frac = 0.80 + vram_gb = round(total_gb * frac, 1) + wired = _run(["sysctl", "-n", "iogpu.wired_limit_mb"]) + try: + wired_mb = int(wired) if wired else 0 + if wired_mb > 0: + vram_gb = round(wired_mb / 1024.0, 1) + except ValueError: + pass + + gpu = {"index": 0, "name": brand, "vram_gb": vram_gb} + return { + "gpu_name": brand, + "gpu_vram_gb": vram_gb, + "gpu_count": 1, + "gpus": [gpu], + "gpu_groups": _group_gpus([gpu]), + "homogeneous": True, + "backend": "metal", + # Unified memory: the "VRAM" above is carved out of system RAM, not a + # separate pool — downstream fit logic uses this to avoid double-budgeting. 
+ "unified_memory": True, + } + + def _read_file(path): """Read a file, locally or via SSH.""" if _remote_host: @@ -246,6 +322,15 @@ def _get_ram_gb(): return (pages * page_size) / (1024**3) except Exception: pass + + # macOS has no /proc/meminfo — fall back to sysctl (works locally and over + # SSH to a remote Mac, where the sysconf path above isn't taken). + memsize = _run(["sysctl", "-n", "hw.memsize"]) + if memsize: + try: + return int(memsize.strip()) / (1024**3) + except ValueError: + pass return 0.0 @@ -263,6 +348,12 @@ def _get_cpu_name(): if line.startswith("model name"): return line.split(":", 1)[1].strip() + # macOS has no /proc/cpuinfo — sysctl gives the chip name (e.g. "Apple M4"). + # Harmlessly returns nothing on Linux, so it's safe to try unconditionally. + brand = _run(["sysctl", "-n", "machdep.cpu.brand_string"]) + if brand and brand.strip(): + return brand.strip() + if not _remote_host: return platform.processor() or "unknown" return "unknown" @@ -270,7 +361,8 @@ def _get_cpu_name(): def _get_cpu_count(): if _remote_host: - out = _run(["nproc"]) + # nproc on Linux; hw.ncpu via sysctl on a remote Mac (no nproc there). + out = _run(["nproc"]) or _run(["sysctl", "-n", "hw.ncpu"]) if out: try: return int(out.strip()) @@ -411,7 +503,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False): cpu_cores = _get_cpu_count() cpu_name = _get_cpu_name() - gpu_info = _detect_nvidia() or _detect_amd() + gpu_info = _detect_apple_silicon() or _detect_nvidia() or _detect_amd() if gpu_info: result = {