fix(hwfit): distinguish Apple Silicon bandwidth variants (#2564)

* fix: resolve Apple Silicon bandwidth variants
* fix(hwfit): preserve string lookup path in _lookup_bandwidth
* fix(hwfit): guard Apple bandwidth lookup against false GPU matches

Add an `"apple" not in gn` check to _lookup_apple_bandwidth() so that non-Apple GPUs with "m3"/"m4"/"m5" in their names (e.g. NVIDIA Quadro M4000) don't incorrectly match Apple bandwidth tiers. Addresses @o3LL review comment on PR #2564.
2026-06-15 17:25:26 -04:00 · 2026-06-15 15:13:03 +02:00
parent 514d345334
commit f7aa2de410
4 changed files with 184 additions and 17 deletions
@@ -19,22 +19,32 @@ GPU_BANDWIDTH = {
    "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
    "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
    "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
-    # Apple Silicon unified-memory bandwidth (GB/s). Keyed off the chip name
-    # reported by sysctl machdep.cpu.brand_string (e.g. "Apple M4 Max"). Listed
-    # before the bare "m_" keys matters less than length-sorting (done below),
-    # which guarantees "m4 max" is tried before "m4".
-    "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68,
-    "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100,
-    "m3 ultra": 800, "m3 max": 300, "m3 pro": 150, "m3": 100,
-    "m4 max": 546, "m4 pro": 273, "m4": 120,
-    "m5 max": 546, "m5 pro": 273, "m5": 150,
 }

 # Pre-sort keys by length descending for correct substring matching
 _BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True)

-# metal: backstop for Apple Silicon chips not in GPU_BANDWIDTH (e.g. a future
-# M5) — the named chips above take the accurate bandwidth path instead.
+# Apple Silicon unified-memory bandwidth (GB/s). For chip families with both
+# binned and full variants under the same "Apple Mx Max" brand string, prefer
+# GPU core count when hardware detection provides it; otherwise fall back to the
+# conservative tier so speed estimates do not over-promise.
+#
+# Chips with a single bandwidth figure: lowercase name fragment -> GB/s.
+APPLE_BANDWIDTH_FIXED = {
+    "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68,
+    "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100,
+    "m3 ultra": 800, "m3 pro": 150, "m3": 100,
+    "m4 pro": 273, "m4": 120,
+    "m5 pro": 307, "m5": 153,
+}
+# Chips whose bandwidth depends on the GPU variant:
+# lowercase name fragment -> {GPU core count: GB/s}.
+APPLE_BANDWIDTH_BY_CORES = {
+    "m3 max": {30: 300, 40: 400},
+    "m4 max": {32: 410, 40: 546},
+    "m5 max": {32: 460, 40: 614},
+}
+# Longest keys first, so substring matching tries e.g. "m1 ultra" before "m1".
+_APPLE_FIXED_KEYS_SORTED = sorted(APPLE_BANDWIDTH_FIXED.keys(), key=len, reverse=True)
+_APPLE_VARIANT_KEYS_SORTED = sorted(APPLE_BANDWIDTH_BY_CORES.keys(), key=len, reverse=True)
+
+# metal: backstop for Apple Silicon chips not in the explicit tables above
+# (e.g. a future M6) — use a conservative generic estimate when unknown.
 FALLBACK_K = {"cuda": 220, "rocm": 180, "metal": 150, "cpu_x86": 70, "cpu_arm": 90}

 USE_CASE_WEIGHTS = {
@@ -60,10 +70,51 @@ CONTEXT_TARGET = {
 }


-def _lookup_bandwidth(gpu_name):
+def _lookup_apple_bandwidth(system):
+    """Return the Apple Silicon memory bandwidth (GB/s) for ``system``, or None.
+
+    Args:
+        system: Mapping read with ``.get``. ``gpu_name`` is matched
+            case-insensitively by substring against the Apple tables;
+            ``gpu_cores`` (optional, anything ``int()`` accepts) selects
+            between variants that share one chip name.
+
+    Returns:
+        The matching table value, or None when ``gpu_name`` is missing, is not
+        a non-empty string, does not contain "apple", or matches no table key.
+    """
+    gpu_name = system.get("gpu_name")
    if not isinstance(gpu_name, str) or not gpu_name:
        return None
    gn = gpu_name.lower()
+
+    # Guard against false matches on non-Apple GPUs whose names contain
+    # "m3"/"m4"/"m5" (e.g. NVIDIA Quadro M4000).
+    if "apple" not in gn:
+        return None
+
+    raw_cores = system.get("gpu_cores")
+    try:
+        gpu_cores = int(raw_cores) if raw_cores is not None else None
+    except (TypeError, ValueError, OverflowError):
+        # int() raises OverflowError for float infinity; like any other
+        # unusable value, treat it as "core count unknown".
+        gpu_cores = None
+
+    # Variant chips first: an exact core-count match picks its tier; an
+    # unknown or unlisted count gets the lowest tier listed for that chip.
+    for key in _APPLE_VARIANT_KEYS_SORTED:
+        if key not in gn:
+            continue
+        tiers = APPLE_BANDWIDTH_BY_CORES[key]
+        if gpu_cores in tiers:
+            return tiers[gpu_cores]
+        return min(tiers.values())
+
+    # Chips with a single bandwidth figure.
+    for key in _APPLE_FIXED_KEYS_SORTED:
+        if key in gn:
+            return APPLE_BANDWIDTH_FIXED[key]
+    return None
+
+
+def _lookup_bandwidth(system):
+    if isinstance(system, dict):
+        gpu_name = system.get("gpu_name")
+    else:
+        gpu_name = system
+
+    if not isinstance(gpu_name, str) or not gpu_name:
+        return None
+
+    if isinstance(system, dict):
+        bw = _lookup_apple_bandwidth(system)
+        if bw is not None:
+            return bw
+
+    gn = gpu_name.lower()
    for key in _BW_KEYS_SORTED:
        if key in gn:
            return GPU_BANDWIDTH[key]
@@ -84,7 +135,7 @@ def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
    """
    pb = _active_params_b(model)
    is_moe = model.get("is_moe", False)
-    bw = _lookup_bandwidth(system.get("gpu_name"))
+    bw = _lookup_bandwidth(system)
    backend = system.get("backend", "cpu_x86")

    if bw and run_mode in ("gpu", "cpu_offload"):
@@ -1,3 +1,4 @@
+import json
 import os
 import platform
 import re
@@ -335,6 +336,37 @@ def _detect_apple_silicon():
    if total_gb <= 0:
        return None

+    def _parse_apple_gpu_cores(text):
+        """Extract the Apple GPU core count from display-profiler output.
+
+        ``text`` is tried as JSON first: the first dict entry under
+        "SPDisplaysDataType" whose "sppci_model" (or "_name") contains "apple"
+        and whose "sppci_cores" converts to an int wins. Failing that, the
+        raw text is searched for a "Total Number of Cores: <n>" line.
+
+        Returns the core count as an int, or None when nothing usable is found.
+        """
+        if not text:
+            return None
+
+        # json.JSONDecodeError is a ValueError subclass, so ValueError covers it.
+        try:
+            payload = json.loads(text)
+        except (TypeError, ValueError):
+            payload = None
+
+        entries = []
+        if isinstance(payload, dict):
+            entries = payload.get("SPDisplaysDataType") or []
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            label = str(entry.get("sppci_model") or entry.get("_name") or "")
+            if "apple" in label.lower():
+                try:
+                    return int(str(entry.get("sppci_cores")).strip())
+                except (TypeError, ValueError):
+                    # Unusable core value on this entry; keep scanning.
+                    pass
+
+        # Plain-text fallback.
+        found = re.search(r"Total Number of Cores:\s*(\d+)", text)
+        if found is None:
+            return None
+        try:
+            return int(found.group(1))
+        except ValueError:
+            return None
+
+    gpu_cores = _parse_apple_gpu_cores(_run(["system_profiler", "SPDisplaysDataType", "-json"]))
+    if gpu_cores is None:
+        gpu_cores = _parse_apple_gpu_cores(_run(["system_profiler", "SPDisplaysDataType"]))
+
    # Usable GPU budget. macOS lets Metal use most of unified memory, but the
    # default working-set limit scales with RAM: small machines have to keep
    # more back for the OS + app. These fractions track Apple's
@@ -357,7 +389,7 @@ def _detect_apple_silicon():
        pass

    gpu = {"index": 0, "name": brand, "vram_gb": vram_gb}
-    return {
+    info = {
        "gpu_name": brand,
        "gpu_vram_gb": vram_gb,
        "gpu_count": 1,
@@ -369,6 +401,9 @@ def _detect_apple_silicon():
        # separate pool — downstream fit logic uses this to avoid double-budgeting.
        "unified_memory": True,
    }
+    if gpu_cores is not None:
+        info["gpu_cores"] = gpu_cores
+    return info


 def _read_file(path):
@@ -772,6 +807,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
            "gpu_name": gpu_info["gpu_name"],
            "gpu_vram_gb": gpu_info["gpu_vram_gb"],
            "gpu_count": gpu_info["gpu_count"],
+            "gpu_cores": gpu_info.get("gpu_cores"),
            "gpus": gpu_info.get("gpus", []),
            "gpu_groups": gpu_info.get("gpu_groups", []),
            "homogeneous": gpu_info.get("homogeneous", True),