Merge origin/dev into main

2026-06-28 15:45:22 -04:00 · 2026-06-21 11:08:50 +00:00
parent c504214925 160267417e
commit 75f04bc088
203 changed files with 11283 additions and 1649 deletions
@@ -19,6 +19,10 @@ GPU_BANDWIDTH = {
    "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
    "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
    "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
+    # NVIDIA GB10 Grace-Blackwell superchip (DGX Spark). Unified LPDDR5X memory,
+    # not Apple Silicon, so it lives in the generic GPU table — the Apple-only
+    # lookup never matches it (its name carries no "apple").
+    "gb10": 273,
 }

 # Pre-sort keys by length descending for correct substring matching
@@ -126,6 +130,44 @@ def _lookup_bandwidth(system):
    return None


+def _canonical_cpu_backend(system):
+    """Return the canonical CPU backend for cpu_only speed estimation.
+
+    Normalizes CPU-architecture aliases separately from the GPU backend, and
+    overrides GPU-only backends (CUDA/ROCm/Metal) so they do not inherit a
+    discrete-GPU fallback constant when the model is actually running on CPU.
+
+    Reads the "backend", "cpu_arch", "cpu_name" and "gpu_name" keys of
+    ``system``; a missing or None value is treated as an empty string.
+    Always returns either "cpu_x86" or "cpu_arm". The checks run in priority
+    order and the first match wins: canonical backend, raw architecture
+    alias in "backend", explicit "cpu_arch", Apple hints, then the default.
+    """
+    backend = (system.get("backend") or "").lower().strip()
+    cpu_arch = (system.get("cpu_arch") or "").lower().strip()
+    cpu_name = (system.get("cpu_name") or "").lower()
+    gpu_name = (system.get("gpu_name") or "").lower()
+
+    # Already-canonical CPU backends
+    if backend in ("cpu_x86", "cpu_arm"):
+        return backend
+
+    # Raw CPU-architecture aliases. Treat plain "arm" as 32-bit ARM, not the
+    # ARM64-class CPU fallback used for Apple Silicon/aarch64 machines.
+    # Plain "arm" is therefore absent from both tuples below and falls
+    # through to the later checks.
+    if backend in ("x86_64", "amd64", "i386", "i686"):
+        return "cpu_x86"
+    if backend in ("arm64", "aarch64"):
+        return "cpu_arm"
+
+    # Prefer an explicit CPU architecture field when present
+    # An unrecognized value (including plain "arm") matches neither tuple
+    # and continues to the Apple check and the default below.
+    if cpu_arch:
+        if cpu_arch in ("x86_64", "amd64", "x86", "i386", "i686"):
+            return "cpu_x86"
+        if cpu_arch in ("arm64", "aarch64"):
+            return "cpu_arm"
+
+    # Apple Silicon enters ranking as backend="metal"; its CPU path is ARM.
+    # The substring test on the CPU/GPU names catches systems whose backend
+    # field is not one of the three Apple values listed here.
+    if backend in ("metal", "mps", "apple") or "apple" in cpu_name or "apple" in gpu_name:
+        return "cpu_arm"
+
+    # Conservative default for CUDA/ROCm/discrete GPU backends and unknowns.
+    return "cpu_x86"
+
+
 def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
    """Estimate tok/s. Uses active params for MoE (only active experts run per token).

@@ -143,6 +185,11 @@ def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
    bw = _lookup_bandwidth(system)
    backend = system.get("backend", "cpu_x86")

+    # CPU-only inference must never inherit a GPU backend's fallback constant,
+    # even if the detected system happens to report a CUDA/Metal/ROCm backend.
+    if run_mode == "cpu_only":
+        backend = _canonical_cpu_backend(system)
+
    if bw and run_mode in ("gpu", "cpu_offload"):
        bpp = QUANT_BYTES_PER_PARAM.get(quant, 0.5)
        model_gb = pb * bpp
@@ -330,7 +330,7 @@ def _detect_apple_silicon():

    # Only Apple Silicon (arm64) has a Metal GPU worth serving LLMs on; Intel
    # Macs fall through to the CPU path.
-    if "arm" not in arch and "aarch64" not in arch:
+    if _canonical_cpu_arch(arch) != "arm64":
        return None

    # Chip name, e.g. "Apple M4 Max" — carries the Pro/Max/Ultra variant that
@@ -513,6 +513,25 @@ def _get_cpu_count():
    return os.cpu_count() or 1


+def _canonical_cpu_arch(value):
+    """Normalize a raw CPU-architecture string to a canonical name.
+
+    ``value`` is converted with ``str()`` (None or any other falsy value
+    becomes ""), lower-cased, stripped of surrounding whitespace, and has
+    "-" replaced by "_" before matching.
+
+    Returns "x86_64", "x86", "arm64" or "arm" for the recognized aliases.
+    Any other input is returned in its normalized form, so empty input
+    yields "".
+    """
+    arch = str(value or "").lower().strip().replace("-", "_")
+    if arch in ("x86_64", "amd64", "x64"):
+        return "x86_64"
+    if arch in ("i386", "i686", "x86"):
+        return "x86"
+    if arch in ("arm64", "aarch64"):
+        return "arm64"
+    # Plain "arm" and any "armv*" name are kept distinct from "arm64".
+    if arch == "arm" or arch.startswith("armv"):
+        return "arm"
+    # Unrecognized names pass through normalized rather than being guessed.
+    return arch
+
+
+def _get_cpu_arch():
+    """Return the canonical CPU architecture of the system being inspected.
+
+    When the module-level ``_remote_host`` is truthy, the architecture comes
+    from ``uname -m`` executed through ``_run`` (presumably on that remote
+    host; confirm against ``_run``). A falsy result is treated as "".
+    Otherwise the local ``platform.machine()`` value is used. Either way the
+    raw string is passed through ``_canonical_cpu_arch``.
+    """
+    if _remote_host:
+        return _canonical_cpu_arch(_run(["uname", "-m"]) or "")
+    # NOTE(review): relies on the module-level ``platform`` import; verify it
+    # is not shadowed in this scope.
+    return _canonical_cpu_arch(platform.machine())
+
+
 def _powershell_exe():
    """Pick the best PowerShell executable for LOCAL execution: prefer pwsh
    (PowerShell 7+), fall back to Windows PowerShell 5.1. Returns an absolute
@@ -538,6 +557,7 @@ def _detect_windows():
        $r.cpu_name = $cpu.Name
        $r.cpu_cores = (Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum
        $r.arch = $cpu.AddressWidth
+        $r.cpu_arch = if ($env:PROCESSOR_ARCHITEW6432) { $env:PROCESSOR_ARCHITEW6432 } else { $env:PROCESSOR_ARCHITECTURE }
        # GPU detection via nvidia-smi (fastest) or WMI fallback
        try { 
            $nv = nvidia-smi --query-gpu=memory.total,name --format=csv,noheader,nounits 2>$null
@@ -609,6 +629,7 @@ def _detect_windows():
            "available_ram_gb": d.get("avail_gb", 0),
            "cpu_cores": _as_int(d.get("cpu_cores"), 1),
            "cpu_name": _cpu_name,
+            "cpu_arch": _canonical_cpu_arch(d.get("cpu_arch")),
            "has_gpu": bool(d.get("gpu_name")),
            "gpu_name": d.get("gpu_name"),
            "gpu_vram_gb": d.get("gpu_vram_gb"),
@@ -804,6 +825,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
    available_ram = round(_get_available_ram_gb(), 1)
    cpu_cores = _get_cpu_count()
    cpu_name = _get_cpu_name()
+    cpu_arch = _get_cpu_arch()

    gpu_info = _detect_apple_silicon() or _detect_nvidia() or _detect_amd()

@@ -813,6 +835,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
            "available_ram_gb": available_ram,
            "cpu_cores": cpu_cores,
            "cpu_name": cpu_name,
+            "cpu_arch": cpu_arch,
            "has_gpu": True,
            "gpu_name": gpu_info["gpu_name"],
            "gpu_vram_gb": gpu_info["gpu_vram_gb"],
@@ -827,17 +850,13 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
            "unified_memory": gpu_info.get("unified_memory", False),
        }
    else:
-        if _remote_host:
-            arch_out = _run(["uname", "-m"]) or ""
-        else:
-            import platform as _platform
-            arch_out = _platform.machine().lower()
-        backend = "cpu_arm" if "aarch64" in arch_out or "arm" in arch_out else "cpu_x86"
+        backend = "cpu_arm" if cpu_arch == "arm64" else "cpu_x86"
        result = {
            "total_ram_gb": total_ram,
            "available_ram_gb": available_ram,
            "cpu_cores": cpu_cores,
            "cpu_name": cpu_name,
+            "cpu_arch": cpu_arch,
            "has_gpu": False,
            "gpu_name": None,
            "gpu_vram_gb": None,