diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py
index 242050e7a..14865d905 100644
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -130,6 +130,43 @@ def _lookup_bandwidth(system):
     return None
 
 
+def _canonical_cpu_backend(system):
+    """Return "cpu_x86" or "cpu_arm" for cpu_only speed estimation.
+
+    GPU-only backends (CUDA/ROCm/Metal) are overridden so a CPU run does not
+    inherit a discrete-GPU fallback constant. Resolution order: a canonical or
+    architecture-style ``backend``, then ``cpu_arch``, then Apple hints (a
+    Metal/MPS/Apple backend or "apple" in the CPU/GPU name, treated as ARM),
+    else "cpu_x86". Versioned ARM names such as "armv7l" are recognized too.
+    """
+    x86_aliases = ("x86_64", "amd64", "x86", "i386", "i686")
+
+    def _arch_family(value):
+        # Map a raw architecture string to a canonical backend, or None.
+        if value in x86_aliases:
+            return "cpu_x86"
+        if value.startswith(("arm", "aarch64")):
+            return "cpu_arm"
+        return None
+
+    backend = (system.get("backend") or "").lower().strip()
+    if backend in ("cpu_x86", "cpu_arm"):
+        return backend
+
+    # An architecture-style backend value takes precedence over cpu_arch.
+    cpu_arch = (system.get("cpu_arch") or "").lower().strip()
+    for raw in (backend, cpu_arch):
+        family = _arch_family(raw)
+        if family:
+            return family
+
+    # Apple Silicon enters ranking as backend="metal"; its CPU path is ARM.
+    names = f'{system.get("cpu_name") or ""} {system.get("gpu_name") or ""}'
+    if backend in ("metal", "mps", "apple") or "apple" in names.lower():
+        return "cpu_arm"
+    return "cpu_x86"  # conservative default for CUDA/ROCm and unknowns
+
+
 def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
     """Estimate tok/s. Uses active params for MoE (only active experts run per token).
 
@@ -147,6 +184,11 @@ def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
     bw = _lookup_bandwidth(system)
     backend = system.get("backend", "cpu_x86")
 
+    # CPU-only inference must never inherit a GPU backend's fallback constant,
+    # even if the detected system happens to report a CUDA/Metal/ROCm backend.
+    if run_mode == "cpu_only":
+        backend = _canonical_cpu_backend(system)
+
     if bw and run_mode in ("gpu", "cpu_offload"):
         bpp = QUANT_BYTES_PER_PARAM.get(quant, 0.5)
         model_gb = pb * bpp
diff --git a/tests/test_hwfit_cpu_only_fallback.py b/tests/test_hwfit_cpu_only_fallback.py
new file mode 100644
index 000000000..765f99051
--- /dev/null
+++ b/tests/test_hwfit_cpu_only_fallback.py
@@ -0,0 +1,115 @@
+"""Regression test for cpu_only backend fallback in hwfit speed estimation."""
+
+import pytest
+
+from services.hwfit.fit import _estimate_speed
+
+
+DENSE_MODEL = {  # 7B model fixture passed as the `model` argument in every test
+    "name": "Test-7B",
+    "parameter_count": "7B",
+    "parameters_raw": 7_000_000_000,
+}
+
+CUDA_SYSTEM = {  # system reporting backend "cuda" with an NVIDIA GPU name
+    "backend": "cuda",
+    "gpu_name": "NVIDIA RTX 4090",
+    "gpu_vram_gb": 24.0,
+}
+
+CPU_X86_SYSTEM = {  # canonical "cpu_x86" backend; no GPU name, zero VRAM
+    "backend": "cpu_x86",
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
+CPU_ARM_SYSTEM = {  # canonical "cpu_arm" backend; no GPU name, zero VRAM
+    "backend": "cpu_arm",
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
+METAL_SYSTEM = {  # system reporting backend "metal" with an Apple GPU name
+    "backend": "metal",
+    "gpu_name": "Apple M3 Max",
+    "gpu_vram_gb": 36.0,
+}
+
+ROCM_SYSTEM = {  # system reporting backend "rocm" with an AMD GPU name
+    "backend": "rocm",
+    "gpu_name": "AMD Radeon RX 7900 XTX",
+    "gpu_vram_gb": 24.0,
+}
+
+ARM64_SYSTEM = {  # raw "arm64" architecture alias in the backend field
+    "backend": "arm64",
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
+AARCH64_SYSTEM = {  # raw "aarch64" architecture alias in the backend field
+    "backend": "aarch64",
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
+QUANT = "Q4_K_M"  # quantization label passed to every _estimate_speed call
+
+
+@pytest.mark.parametrize(
+    "non_cpu_system",
+    [pytest.param(CUDA_SYSTEM, id="cuda"), pytest.param(ROCM_SYSTEM, id="rocm")],
+)
+def test_cpu_only_on_non_cpu_backend_uses_cpu_x86_fallback(non_cpu_system):
+    """A cpu_only estimate on a CUDA/ROCm system equals the cpu_x86 estimate."""
+    # Same model, quant, and run mode; only the reported system differs.
+    baseline = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_X86_SYSTEM)
+    observed = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", non_cpu_system)
+
+    assert observed > 0
+    assert observed == pytest.approx(baseline, rel=1e-9, abs=1e-9)
+
+
+def test_cpu_only_on_metal_apple_silicon_uses_cpu_arm_fallback():
+    """A cpu_only estimate on the Metal system equals the cpu_arm estimate."""
+    expected = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_ARM_SYSTEM)
+    actual = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", METAL_SYSTEM)
+
+    assert actual > 0
+    assert actual == pytest.approx(expected, rel=1e-9, abs=1e-9)
+
+
+@pytest.mark.parametrize(
+    "arm_alias_system",
+    [ARM64_SYSTEM, AARCH64_SYSTEM, CPU_ARM_SYSTEM],
+    ids=lambda system: system["backend"],
+)
+def test_cpu_only_preserves_arm_backends(arm_alias_system):
+    """Each ARM alias (and cpu_arm itself) yields the cpu_arm cpu_only estimate."""
+    reference = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_ARM_SYSTEM)
+    candidate = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", arm_alias_system)
+
+    assert candidate > 0
+    assert candidate == pytest.approx(reference, rel=1e-9, abs=1e-9)
+
+
+def test_cpu_only_preserves_known_cpu_backends():
+    """Known CPU backends should be preserved, not rewritten to cpu_x86.
+
+    If cpu_arm were rewritten to cpu_x86, both estimates would be equal; the
+    test relies on cpu_arm having the faster fallback constant of the two.
+    """
+    x86_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_X86_SYSTEM)
+    arm_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_ARM_SYSTEM)
+
+    # Strict ordering implies inequality, so no separate `!=` check is needed.
+    assert x86_tps > 0
+    assert arm_tps > x86_tps
+
+
+def test_cpu_only_on_cuda_is_slower_than_gpu_path():
+    """On a CUDA system the cpu_only estimate must be strictly below the gpu one."""
+    gpu_tps = _estimate_speed(DENSE_MODEL, QUANT, "gpu", CUDA_SYSTEM)
+    cpu_only_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CUDA_SYSTEM)
+
+    assert cpu_only_tps < gpu_tps