mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-28 15:45:22 -04:00
Merge origin/dev into main
This commit is contained in:
@@ -19,6 +19,10 @@ GPU_BANDWIDTH = {
|
||||
"6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
|
||||
"mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
|
||||
"9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
|
||||
# NVIDIA GB10 Grace-Blackwell superchip (DGX Spark). Unified LPDDR5X memory,
|
||||
# not Apple Silicon, so it lives in the generic GPU table — the Apple-only
|
||||
# lookup never matches it (its name carries no "apple").
|
||||
"gb10": 273,
|
||||
}
|
||||
|
||||
# Pre-sort keys by length descending for correct substring matching
|
||||
@@ -126,6 +130,44 @@ def _lookup_bandwidth(system):
|
||||
return None
|
||||
|
||||
|
||||
def _canonical_cpu_backend(system):
|
||||
"""Return the canonical CPU backend for cpu_only speed estimation.
|
||||
|
||||
Normalizes CPU-architecture aliases separately from the GPU backend, and
|
||||
overrides GPU-only backends (CUDA/ROCm/Metal) so they do not inherit a
|
||||
discrete-GPU fallback constant when the model is actually running on CPU.
|
||||
"""
|
||||
backend = (system.get("backend") or "").lower().strip()
|
||||
cpu_arch = (system.get("cpu_arch") or "").lower().strip()
|
||||
cpu_name = (system.get("cpu_name") or "").lower()
|
||||
gpu_name = (system.get("gpu_name") or "").lower()
|
||||
|
||||
# Already-canonical CPU backends
|
||||
if backend in ("cpu_x86", "cpu_arm"):
|
||||
return backend
|
||||
|
||||
# Raw CPU-architecture aliases. Treat plain "arm" as 32-bit ARM, not the
|
||||
# ARM64-class CPU fallback used for Apple Silicon/aarch64 machines.
|
||||
if backend in ("x86_64", "amd64", "i386", "i686"):
|
||||
return "cpu_x86"
|
||||
if backend in ("arm64", "aarch64"):
|
||||
return "cpu_arm"
|
||||
|
||||
# Prefer an explicit CPU architecture field when present
|
||||
if cpu_arch:
|
||||
if cpu_arch in ("x86_64", "amd64", "x86", "i386", "i686"):
|
||||
return "cpu_x86"
|
||||
if cpu_arch in ("arm64", "aarch64"):
|
||||
return "cpu_arm"
|
||||
|
||||
# Apple Silicon enters ranking as backend="metal"; its CPU path is ARM.
|
||||
if backend in ("metal", "mps", "apple") or "apple" in cpu_name or "apple" in gpu_name:
|
||||
return "cpu_arm"
|
||||
|
||||
# Conservative default for CUDA/ROCm/discrete GPU backends and unknowns.
|
||||
return "cpu_x86"
|
||||
|
||||
|
||||
def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
|
||||
"""Estimate tok/s. Uses active params for MoE (only active experts run per token).
|
||||
|
||||
@@ -143,6 +185,11 @@ def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
|
||||
bw = _lookup_bandwidth(system)
|
||||
backend = system.get("backend", "cpu_x86")
|
||||
|
||||
# CPU-only inference must never inherit a GPU backend's fallback constant,
|
||||
# even if the detected system happens to report a CUDA/Metal/ROCm backend.
|
||||
if run_mode == "cpu_only":
|
||||
backend = _canonical_cpu_backend(system)
|
||||
|
||||
if bw and run_mode in ("gpu", "cpu_offload"):
|
||||
bpp = QUANT_BYTES_PER_PARAM.get(quant, 0.5)
|
||||
model_gb = pb * bpp
|
||||
|
||||
@@ -330,7 +330,7 @@ def _detect_apple_silicon():
|
||||
|
||||
# Only Apple Silicon (arm64) has a Metal GPU worth serving LLMs on; Intel
|
||||
# Macs fall through to the CPU path.
|
||||
if "arm" not in arch and "aarch64" not in arch:
|
||||
if _canonical_cpu_arch(arch) != "arm64":
|
||||
return None
|
||||
|
||||
# Chip name, e.g. "Apple M4 Max" — carries the Pro/Max/Ultra variant that
|
||||
@@ -513,6 +513,25 @@ def _get_cpu_count():
|
||||
return os.cpu_count() or 1
|
||||
|
||||
|
||||
def _canonical_cpu_arch(value):
|
||||
arch = str(value or "").lower().strip().replace("-", "_")
|
||||
if arch in ("x86_64", "amd64", "x64"):
|
||||
return "x86_64"
|
||||
if arch in ("i386", "i686", "x86"):
|
||||
return "x86"
|
||||
if arch in ("arm64", "aarch64"):
|
||||
return "arm64"
|
||||
if arch == "arm" or arch.startswith("armv"):
|
||||
return "arm"
|
||||
return arch
|
||||
|
||||
|
||||
def _get_cpu_arch():
    """Return the canonical CPU architecture of the target machine.

    When ``_remote_host`` is set, the raw value is the output of ``uname -m``
    obtained through ``_run`` (an empty/None result becomes ""); otherwise it
    is ``platform.machine()``. Either way the raw value is passed through
    ``_canonical_cpu_arch``.
    """
    if not _remote_host:
        return _canonical_cpu_arch(platform.machine())

    uname_output = _run(["uname", "-m"])
    return _canonical_cpu_arch(uname_output or "")
|
||||
|
||||
|
||||
def _powershell_exe():
|
||||
"""Pick the best PowerShell executable for LOCAL execution: prefer pwsh
|
||||
(PowerShell 7+), fall back to Windows PowerShell 5.1. Returns an absolute
|
||||
@@ -538,6 +557,7 @@ def _detect_windows():
|
||||
$r.cpu_name = $cpu.Name
|
||||
$r.cpu_cores = (Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum
|
||||
$r.arch = $cpu.AddressWidth
|
||||
$r.cpu_arch = if ($env:PROCESSOR_ARCHITEW6432) { $env:PROCESSOR_ARCHITEW6432 } else { $env:PROCESSOR_ARCHITECTURE }
|
||||
# GPU detection via nvidia-smi (fastest) or WMI fallback
|
||||
try {
|
||||
$nv = nvidia-smi --query-gpu=memory.total,name --format=csv,noheader,nounits 2>$null
|
||||
@@ -609,6 +629,7 @@ def _detect_windows():
|
||||
"available_ram_gb": d.get("avail_gb", 0),
|
||||
"cpu_cores": _as_int(d.get("cpu_cores"), 1),
|
||||
"cpu_name": _cpu_name,
|
||||
"cpu_arch": _canonical_cpu_arch(d.get("cpu_arch")),
|
||||
"has_gpu": bool(d.get("gpu_name")),
|
||||
"gpu_name": d.get("gpu_name"),
|
||||
"gpu_vram_gb": d.get("gpu_vram_gb"),
|
||||
@@ -804,6 +825,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
|
||||
available_ram = round(_get_available_ram_gb(), 1)
|
||||
cpu_cores = _get_cpu_count()
|
||||
cpu_name = _get_cpu_name()
|
||||
cpu_arch = _get_cpu_arch()
|
||||
|
||||
gpu_info = _detect_apple_silicon() or _detect_nvidia() or _detect_amd()
|
||||
|
||||
@@ -813,6 +835,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
|
||||
"available_ram_gb": available_ram,
|
||||
"cpu_cores": cpu_cores,
|
||||
"cpu_name": cpu_name,
|
||||
"cpu_arch": cpu_arch,
|
||||
"has_gpu": True,
|
||||
"gpu_name": gpu_info["gpu_name"],
|
||||
"gpu_vram_gb": gpu_info["gpu_vram_gb"],
|
||||
@@ -827,17 +850,13 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
|
||||
"unified_memory": gpu_info.get("unified_memory", False),
|
||||
}
|
||||
else:
|
||||
if _remote_host:
|
||||
arch_out = _run(["uname", "-m"]) or ""
|
||||
else:
|
||||
import platform as _platform
|
||||
arch_out = _platform.machine().lower()
|
||||
backend = "cpu_arm" if "aarch64" in arch_out or "arm" in arch_out else "cpu_x86"
|
||||
backend = "cpu_arm" if cpu_arch == "arm64" else "cpu_x86"
|
||||
result = {
|
||||
"total_ram_gb": total_ram,
|
||||
"available_ram_gb": available_ram,
|
||||
"cpu_cores": cpu_cores,
|
||||
"cpu_name": cpu_name,
|
||||
"cpu_arch": cpu_arch,
|
||||
"has_gpu": False,
|
||||
"gpu_name": None,
|
||||
"gpu_vram_gb": None,
|
||||
|
||||
Reference in New Issue
Block a user