diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py
index 242050e7a..14865d905 100644
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -130,6 +130,43 @@ def _lookup_bandwidth(system):
     return None
 
 
+def _canonical_cpu_backend(system):
+    """Return the canonical CPU backend ("cpu_x86" or "cpu_arm") for cpu_only runs.
+
+    GPU-only backends (CUDA/ROCm/Metal) are overridden so a cpu_only estimate
+    never inherits a discrete-GPU fallback constant. Resolution order: the
+    ``backend`` field, then ``cpu_arch``, then Apple hints, else "cpu_x86".
+
+    ``system`` is the hardware dict; missing or None fields count as empty.
+    """
+
+    def _classify(arch):
+        # One alias table for both fields so "backend" and "cpu_arch" agree.
+        if arch in ("cpu_x86", "x86_64", "amd64", "x86", "x64", "i386", "i586", "i686"):
+            return "cpu_x86"
+        # Prefix match also covers armv7l / armv8l / aarch64_be style names.
+        if arch == "cpu_arm" or arch.startswith(("arm", "aarch64")):
+            return "cpu_arm"
+        return None
+
+    backend = (system.get("backend") or "").lower().strip()
+    cpu_arch = (system.get("cpu_arch") or "").lower().strip()
+    cpu_name = (system.get("cpu_name") or "").lower()
+    gpu_name = (system.get("gpu_name") or "").lower()
+
+    # A CPU-flavoured backend wins; otherwise consult the cpu_arch field.
+    canonical = _classify(backend) or _classify(cpu_arch)
+    if canonical:
+        return canonical
+
+    # Apple Silicon enters ranking as backend="metal"; its CPU path is ARM.
+    if backend in ("metal", "mps", "apple") or "apple" in cpu_name or "apple" in gpu_name:
+        return "cpu_arm"
+
+    # Conservative default for CUDA/ROCm/discrete GPU backends and unknowns.
+    return "cpu_x86"
+
+
 def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
     """Estimate tok/s. Uses active params for MoE (only active experts run per token).
 
@@ -147,6 +184,11 @@ def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
     bw = _lookup_bandwidth(system)
     backend = system.get("backend", "cpu_x86")
 
+    # CPU-only inference must never inherit a GPU backend's fallback constant,
+    # even if the detected system happens to report a CUDA/Metal/ROCm backend.
+    if run_mode == "cpu_only":
+        backend = _canonical_cpu_backend(system)
+
     if bw and run_mode in ("gpu", "cpu_offload"):
         bpp = QUANT_BYTES_PER_PARAM.get(quant, 0.5)
         model_gb = pb * bpp
diff --git a/tests/test_hwfit_cpu_only_fallback.py b/tests/test_hwfit_cpu_only_fallback.py
new file mode 100644
index 000000000..765f99051
--- /dev/null
+++ b/tests/test_hwfit_cpu_only_fallback.py
@@ -0,0 +1,115 @@
+"""Regression test for cpu_only backend fallback in hwfit speed estimation."""
+
+import pytest
+
+from services.hwfit.fit import _estimate_speed
+
+
+DENSE_MODEL = {
+    "name": "Test-7B",
+    "parameter_count": "7B",
+    "parameters_raw": 7_000_000_000,
+}
+
+CUDA_SYSTEM = {
+    "backend": "cuda",
+    "gpu_name": "NVIDIA RTX 4090",
+    "gpu_vram_gb": 24.0,
+}
+
+CPU_X86_SYSTEM = {
+    "backend": "cpu_x86",
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
+CPU_ARM_SYSTEM = {
+    "backend": "cpu_arm",
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
+METAL_SYSTEM = {
+    "backend": "metal",
+    "gpu_name": "Apple M3 Max",
+    "gpu_vram_gb": 36.0,
+}
+
+ROCM_SYSTEM = {
+    "backend": "rocm",
+    "gpu_name": "AMD Radeon RX 7900 XTX",
+    "gpu_vram_gb": 24.0,
+}
+
+ARM64_SYSTEM = {
+    "backend": "arm64",
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
+AARCH64_SYSTEM = {
+    "backend": "aarch64",
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
+QUANT = "Q4_K_M"
+
+
+@pytest.mark.parametrize(
+    "non_cpu_system",
+    [CUDA_SYSTEM, ROCM_SYSTEM],
+    ids=["cuda", "rocm"],
+)
+def test_cpu_only_on_non_cpu_backend_uses_cpu_x86_fallback(non_cpu_system):
+    """cpu_only must ignore discrete GPU backends and use the x86 CPU fallback constant."""
+    non_cpu_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", non_cpu_system)
+    cpu_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_X86_SYSTEM)
+
+    assert non_cpu_tps == pytest.approx(cpu_tps, rel=1e-9, abs=1e-9)
+    assert non_cpu_tps > 0
+
+
+def test_cpu_only_on_metal_apple_silicon_uses_cpu_arm_fallback():
+    """Apple Silicon/Metal cpu_only should map to the ARM CPU fallback constant."""
+    metal_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", METAL_SYSTEM)
+    arm_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_ARM_SYSTEM)
+
+    assert metal_tps == pytest.approx(arm_tps, rel=1e-9, abs=1e-9)
+    assert metal_tps > 0
+
+
+@pytest.mark.parametrize(
+    "arm_alias_system",
+    [ARM64_SYSTEM, AARCH64_SYSTEM, CPU_ARM_SYSTEM],
+    ids=["arm64", "aarch64", "cpu_arm"],
+)
+def test_cpu_only_preserves_arm_backends(arm_alias_system):
+    """ARM CPU backends and their aliases must stay on the ARM CPU fallback."""
+    alias_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", arm_alias_system)
+    arm_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_ARM_SYSTEM)
+
+    assert alias_tps == pytest.approx(arm_tps, rel=1e-9, abs=1e-9)
+    assert alias_tps > 0
+
+
+def test_cpu_only_preserves_known_cpu_backends():
+    """Known CPU backends should be preserved, not rewritten to cpu_x86."""
+    for system in (CPU_X86_SYSTEM, CPU_ARM_SYSTEM):
+        tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", system)
+        assert tps > 0
+
+    # The two CPU backends use different fallback constants, so their results
+    # must differ (cpu_arm is faster in the fallback table than cpu_x86).
+    x86_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_X86_SYSTEM)
+    arm_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_ARM_SYSTEM)
+    assert arm_tps != x86_tps
+    assert arm_tps > x86_tps
+
+
+def test_cpu_only_on_cuda_is_slower_than_gpu_path():
+    """The CPU-only estimate on a CUDA system must be strictly below the GPU path."""
+    cpu_only_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CUDA_SYSTEM)
+    gpu_tps = _estimate_speed(DENSE_MODEL, QUANT, "gpu", CUDA_SYSTEM)
+
+    assert cpu_only_tps < gpu_tps