hwfit: pick Apple Silicon memory bandwidth by GPU core count.

Reviewer notes on this copy of the patch:
  * Line structure was restored from a copy whose newlines had been collapsed.
    Blank lines were re-derived from the @@ hunk counts; the indentation of
    untouched context lines is a best-effort reconstruction, so run
    `git apply --check` before relying on it.
  * tests/test_hwfit_apple_bandwidth.py: the two "non-Apple" tests now call
    _lookup_apple_bandwidth directly. Through _lookup_bandwidth a name such as
    "AMD Radeon RX 9070 XT" falls through to GPU_BANDWIDTH ("9070 xt": 624),
    so the previous `is None` assertion could never pass.
  * json.JSONDecodeError was dropped from one except tuple (it subclasses
    ValueError), and docstrings were added to the three new helpers; hunk
    counts and later offsets were adjusted to match.
  * The "index" lines are carried over unchanged and no longer describe the
    edited post-images.

diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py
index 09aea29db..10ab286e0 100644
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -19,22 +19,32 @@ GPU_BANDWIDTH = {
     "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
     "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
     "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
-    # Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
-    # reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
-    # before the bare "m_" keys matters less than length-sorting (done below),
-    # which guarantees "m4 max" is tried before "m4".
-    "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68,
-    "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100,
-    "m3 ultra": 800, "m3 max": 300, "m3 pro": 150, "m3": 100,
-    "m4 max": 546, "m4 pro": 273, "m4": 120,
-    "m5 max": 546, "m5 pro": 273, "m5": 150,
 }
 
 # Pre-sort keys by length descending for correct substring matching
 _BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True)
 
-# metal: backstop for Apple Silicon chips not in GPU_BANDWIDTH (e.g. a future
-# M5) — the named chips above take the accurate bandwidth path instead.
+# Apple Silicon unified-memory bandwidth (GB/s). For chip families with both
+# binned and full variants under the same "Apple Mx Max" brand string, prefer
+# GPU core count when hardware detection provides it; otherwise fall back to the
+# conservative tier so speed estimates do not over-promise.
+APPLE_BANDWIDTH_FIXED = {
+    "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68,
+    "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100,
+    "m3 ultra": 800, "m3 pro": 150, "m3": 100,
+    "m4 pro": 273, "m4": 120,
+    "m5 pro": 307, "m5": 153,
+}
+APPLE_BANDWIDTH_BY_CORES = {
+    "m3 max": {30: 300, 40: 400},
+    "m4 max": {32: 410, 40: 546},
+    "m5 max": {32: 460, 40: 614},
+}
+_APPLE_FIXED_KEYS_SORTED = sorted(APPLE_BANDWIDTH_FIXED.keys(), key=len, reverse=True)
+_APPLE_VARIANT_KEYS_SORTED = sorted(APPLE_BANDWIDTH_BY_CORES.keys(), key=len, reverse=True)
+
+# metal: backstop for Apple Silicon chips not in the explicit tables above
+# (e.g. a future M6) — use a conservative generic estimate when unknown.
 FALLBACK_K = {"cuda": 220, "rocm": 180, "metal": 150, "cpu_x86": 70, "cpu_arm": 90}
 
 USE_CASE_WEIGHTS = {
@@ -60,10 +70,57 @@ CONTEXT_TARGET = {
 }
 
 
-def _lookup_bandwidth(gpu_name):
+def _lookup_apple_bandwidth(system):
+    """Return Apple Silicon unified-memory bandwidth in GB/s, or None.
+
+    ``gpu_cores`` selects the binned or full "Max" tier when it is known;
+    otherwise the slowest tier for that chip is used.
+    """
+    gpu_name = system.get("gpu_name")
     if not isinstance(gpu_name, str) or not gpu_name:
         return None
     gn = gpu_name.lower()
+
+    # Guard against false matches on non-Apple GPUs whose names contain
+    # "m3"/"m4"/"m5" (e.g. NVIDIA Quadro M4 000).
+    if "apple" not in gn:
+        return None
+
+    raw_cores = system.get("gpu_cores")
+    try:
+        gpu_cores = int(raw_cores) if raw_cores is not None else None
+    except (TypeError, ValueError):
+        gpu_cores = None
+
+    for key in _APPLE_VARIANT_KEYS_SORTED:
+        if key not in gn:
+            continue
+        if gpu_cores in APPLE_BANDWIDTH_BY_CORES[key]:
+            return APPLE_BANDWIDTH_BY_CORES[key][gpu_cores]
+        return min(APPLE_BANDWIDTH_BY_CORES[key].values())
+
+    for key in _APPLE_FIXED_KEYS_SORTED:
+        if key in gn:
+            return APPLE_BANDWIDTH_FIXED[key]
+    return None
+
+
+def _lookup_bandwidth(system):
+    """Return memory bandwidth in GB/s for a system dict or bare GPU name, or None."""
+    if isinstance(system, dict):
+        gpu_name = system.get("gpu_name")
+    else:
+        gpu_name = system
+
+    if not isinstance(gpu_name, str) or not gpu_name:
+        return None
+
+    if isinstance(system, dict):
+        bw = _lookup_apple_bandwidth(system)
+        if bw is not None:
+            return bw
+
+    gn = gpu_name.lower()
     for key in _BW_KEYS_SORTED:
         if key in gn:
             return GPU_BANDWIDTH[key]
@@ -84,7 +141,7 @@ def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
     """
     pb = _active_params_b(model)
     is_moe = model.get("is_moe", False)
-    bw = _lookup_bandwidth(system.get("gpu_name"))
+    bw = _lookup_bandwidth(system)
     backend = system.get("backend", "cpu_x86")
 
     if bw and run_mode in ("gpu", "cpu_offload"):
diff --git a/services/hwfit/hardware.py b/services/hwfit/hardware.py
index 9d868f257..a3ad7ba05 100644
--- a/services/hwfit/hardware.py
+++ b/services/hwfit/hardware.py
@@ -1,3 +1,4 @@
+import json
 import os
 import platform
 import re
@@ -335,6 +336,38 @@ def _detect_apple_silicon():
     if total_gb <= 0:
         return None
 
+    def _parse_apple_gpu_cores(text):
+        """Return the Apple GPU core count parsed from system_profiler output, or None."""
+        if not text:
+            return None
+        try:
+            data = json.loads(text)
+        except (TypeError, ValueError):  # json.JSONDecodeError subclasses ValueError
+            data = None
+        if isinstance(data, dict):
+            for gpu in data.get("SPDisplaysDataType") or []:
+                if not isinstance(gpu, dict):
+                    continue
+                model = str(gpu.get("sppci_model") or gpu.get("_name") or "")
+                if "apple" not in model.lower():
+                    continue
+                cores = gpu.get("sppci_cores")
+                try:
+                    return int(str(cores).strip())
+                except (TypeError, ValueError):
+                    continue
+        m = re.search(r"Total Number of Cores:\s*(\d+)", text)
+        if m:
+            try:
+                return int(m.group(1))
+            except ValueError:
+                return None
+        return None
+
+    gpu_cores = _parse_apple_gpu_cores(_run(["system_profiler", "SPDisplaysDataType", "-json"]))
+    if gpu_cores is None:
+        gpu_cores = _parse_apple_gpu_cores(_run(["system_profiler", "SPDisplaysDataType"]))
+
     # Usable GPU budget. macOS lets Metal use most of unified memory, but the
     # default working-set limit scales with RAM: small machines have to keep
     # more back for the OS + app. These fractions track Apple's
@@ -357,7 +390,7 @@ def _detect_apple_silicon():
         pass
 
     gpu = {"index": 0, "name": brand, "vram_gb": vram_gb}
-    return {
+    info = {
         "gpu_name": brand,
         "gpu_vram_gb": vram_gb,
         "gpu_count": 1,
@@ -369,6 +402,9 @@ def _detect_apple_silicon():
         # separate pool — downstream fit logic uses this to avoid double-budgeting.
         "unified_memory": True,
     }
+    if gpu_cores is not None:
+        info["gpu_cores"] = gpu_cores
+    return info
 
 
 def _read_file(path):
@@ -772,6 +808,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
         "gpu_name": gpu_info["gpu_name"],
         "gpu_vram_gb": gpu_info["gpu_vram_gb"],
         "gpu_count": gpu_info["gpu_count"],
+        "gpu_cores": gpu_info.get("gpu_cores"),
         "gpus": gpu_info.get("gpus", []),
         "gpu_groups": gpu_info.get("gpu_groups", []),
         "homogeneous": gpu_info.get("homogeneous", True),
diff --git a/tests/test_hwfit_apple_bandwidth.py b/tests/test_hwfit_apple_bandwidth.py
new file mode 100644
index 000000000..f5b6df3d4
--- /dev/null
+++ b/tests/test_hwfit_apple_bandwidth.py
@@ -0,0 +1,41 @@
+from services.hwfit.fit import _lookup_apple_bandwidth, _lookup_bandwidth
+
+
+def test_m3_max_bandwidth_uses_gpu_cores():
+    assert _lookup_bandwidth({"gpu_name": "Apple M3 Max", "gpu_cores": 30}) == 300
+    assert _lookup_bandwidth({"gpu_name": "Apple M3 Max", "gpu_cores": 40}) == 400
+
+
+def test_m4_max_bandwidth_uses_gpu_cores():
+    assert _lookup_bandwidth({"gpu_name": "Apple M4 Max", "gpu_cores": 32}) == 410
+    assert _lookup_bandwidth({"gpu_name": "Apple M4 Max", "gpu_cores": 40}) == 546
+
+
+def test_m5_max_bandwidth_uses_gpu_cores():
+    assert _lookup_bandwidth({"gpu_name": "Apple M5 Max", "gpu_cores": 32}) == 460
+    assert _lookup_bandwidth({"gpu_name": "Apple M5 Max", "gpu_cores": 40}) == 614
+
+
+def test_apple_max_bandwidth_falls_back_conservatively_without_gpu_cores():
+    assert _lookup_bandwidth({"gpu_name": "Apple M3 Max"}) == 300
+    assert _lookup_bandwidth({"gpu_name": "Apple M4 Max"}) == 410
+    assert _lookup_bandwidth({"gpu_name": "Apple M5 Max"}) == 460
+
+
+def test_fixed_apple_bandwidth_entries_include_updated_m5_values():
+    assert _lookup_bandwidth({"gpu_name": "Apple M5 Pro"}) == 307
+    assert _lookup_bandwidth({"gpu_name": "Apple M5"}) == 153
+
+
+def test_non_apple_gpu_does_not_match_apple_bandwidth():
+    """NVIDIA Quadro M4 000 should NOT match Apple bandwidth lookup."""
+    assert _lookup_apple_bandwidth({"gpu_name": "NVIDIA Quadro M4 000"}) is None
+    assert _lookup_apple_bandwidth({"gpu_name": "NVIDIA Quadro M3 000"}) is None
+    assert _lookup_apple_bandwidth({"gpu_name": "NVIDIA Quadro M5 000"}) is None
+
+
+def test_non_apple_gpu_with_cores_does_not_match():
+    """NVIDIA GPU with core count should not match Apple bandwidth."""
+    assert _lookup_apple_bandwidth({"gpu_name": "NVIDIA GeForce RTX 4090", "gpu_cores": 128}) is None
+    assert _lookup_apple_bandwidth({"gpu_name": "AMD Radeon RX 9070 XT", "gpu_cores": 64}) is None
+    assert _lookup_bandwidth({"gpu_name": "AMD Radeon RX 9070 XT", "gpu_cores": 64}) == 624
diff --git a/tests/test_hwfit_macos.py b/tests/test_hwfit_macos.py
index b0f7b9ba4..a979d14eb 100644
--- a/tests/test_hwfit_macos.py
+++ b/tests/test_hwfit_macos.py
@@ -4,6 +4,8 @@ Covers the Metal-specific behavior added for Apple Silicon and locks in the
 guarantee that non-macOS (Linux/Windows) detection is unchanged.
 """
 
+import json
+
 from services.hwfit import hardware
 from services.hwfit.fit import rank_models
 from services.hwfit.models import get_models
@@ -22,7 +24,7 @@ def _metal_system(ram_gb=16.0, vram_gb=10.7):
     }
 
 
-def _fake_sysctl(brand="Apple M2 Pro", memsize_gb=32, wired_mb=None):
+def _fake_sysctl(brand="Apple M2 Pro", memsize_gb=32, wired_mb=None, display_json=None, display_text=None):
     def run(cmd):
         joined = " ".join(cmd)
         if "machdep.cpu.brand_string" in joined:
@@ -31,6 +33,12 @@ def _fake_sysctl(brand="Apple M2 Pro", memsize_gb=32, wired_mb=None):
             return str(int(memsize_gb * 1024**3))
         if "iogpu.wired_limit_mb" in joined:
             return str(wired_mb) if wired_mb is not None else None
+        if "system_profiler SPDisplaysDataType -json" in joined:
+            if isinstance(display_json, (dict, list)):
+                return json.dumps(display_json)
+            return display_json
+        if "system_profiler SPDisplaysDataType" in joined:
+            return display_text
         return None
 
     return run
@@ -98,16 +106,47 @@ def test_apple_silicon_detected_as_metal(monkeypatch):
     monkeypatch.setattr(hardware, "_remote_host", None)
     monkeypatch.setattr(hardware.platform, "system", lambda: "Darwin")
     monkeypatch.setattr(hardware.platform, "machine", lambda: "arm64")
-    monkeypatch.setattr(hardware, "_run", _fake_sysctl(memsize_gb=32))
+    monkeypatch.setattr(hardware, "_run", _fake_sysctl(
+        memsize_gb=32,
+        display_json={"SPDisplaysDataType": [{"sppci_model": "Apple M2 Pro", "sppci_cores": "19"}]},
+    ))
 
     info = hardware._detect_apple_silicon()
     assert info is not None
     assert info["backend"] == "metal"
     assert info["gpu_name"] == "Apple M2 Pro"
     assert info["unified_memory"] is True
+    assert info["gpu_cores"] == 19
     assert info["gpu_vram_gb"] == 24.0  # 32GB * 0.75
 
 
+def test_apple_silicon_gpu_cores_fall_back_to_plain_text(monkeypatch):
+    monkeypatch.setattr(hardware, "_remote_host", None)
+    monkeypatch.setattr(hardware.platform, "system", lambda: "Darwin")
+    monkeypatch.setattr(hardware.platform, "machine", lambda: "arm64")
+    monkeypatch.setattr(hardware, "_run", _fake_sysctl(
+        brand="Apple M4 Max",
+        memsize_gb=64,
+        display_json="{not-json",
+        display_text="Graphics/Displays:\n\nApple M4 Max:\n Total Number of Cores: 32\n",
+    ))
+
+    info = hardware._detect_apple_silicon()
+    assert info is not None
+    assert info["gpu_cores"] == 32
+
+
+def test_apple_silicon_gpu_cores_are_optional(monkeypatch):
+    monkeypatch.setattr(hardware, "_remote_host", None)
+    monkeypatch.setattr(hardware.platform, "system", lambda: "Darwin")
+    monkeypatch.setattr(hardware.platform, "machine", lambda: "arm64")
+    monkeypatch.setattr(hardware, "_run", _fake_sysctl(memsize_gb=32))
+
+    info = hardware._detect_apple_silicon()
+    assert info is not None
+    assert "gpu_cores" not in info
+
+
 def test_apple_silicon_skipped_on_linux(monkeypatch):
     """Guarantee Linux detection is untouched: the Metal probe bails immediately."""
     monkeypatch.setattr(hardware, "_remote_host", None)
@@ -132,7 +171,7 @@ def test_detect_system_propagates_unified_memory(monkeypatch):
     monkeypatch.setattr(hardware, "_detect_apple_silicon", lambda: {
         "gpu_name": "Apple M4", "gpu_vram_gb": 10.7, "gpu_count": 1,
         "gpus": [], "gpu_groups": [], "homogeneous": True,
-        "backend": "metal", "unified_memory": True,
+        "backend": "metal", "unified_memory": True, "gpu_cores": 10,
     })
     monkeypatch.setattr(hardware, "_get_ram_gb", lambda: 16.0)
     monkeypatch.setattr(hardware, "_get_available_ram_gb", lambda: 11.0)
@@ -142,3 +181,4 @@ def test_detect_system_propagates_unified_memory(monkeypatch):
     s = hardware.detect_system(fresh=True)
     assert s["backend"] == "metal"
     assert s.get("unified_memory") is True
+    assert s["gpu_cores"] == 10