diff --git a/services/hwfit/fit.py b/services/hwfit/fit.py
index 14865d905..a5a49a7ff 100644
--- a/services/hwfit/fit.py
+++ b/services/hwfit/fit.py
@@ -146,17 +146,18 @@ def _canonical_cpu_backend(system):
     if backend in ("cpu_x86", "cpu_arm"):
         return backend
 
-    # Raw CPU-architecture aliases
+    # Raw CPU-architecture aliases. Treat plain "arm" as 32-bit ARM, not the
+    # ARM64-class CPU fallback used for Apple Silicon/aarch64 machines.
     if backend in ("x86_64", "amd64", "i386", "i686"):
         return "cpu_x86"
-    if backend in ("arm64", "aarch64", "arm"):
+    if backend in ("arm64", "aarch64"):
         return "cpu_arm"
 
     # Prefer an explicit CPU architecture field when present
     if cpu_arch:
         if cpu_arch in ("x86_64", "amd64", "x86", "i386", "i686"):
             return "cpu_x86"
-        if cpu_arch in ("arm64", "aarch64", "arm"):
+        if cpu_arch in ("arm64", "aarch64"):
             return "cpu_arm"
 
     # Apple Silicon enters ranking as backend="metal"; its CPU path is ARM.
diff --git a/services/hwfit/hardware.py b/services/hwfit/hardware.py
index a3ad7ba05..1c4529839 100644
--- a/services/hwfit/hardware.py
+++ b/services/hwfit/hardware.py
@@ -320,7 +320,7 @@ def _detect_apple_silicon():
 
     # Only Apple Silicon (arm64) has a Metal GPU worth serving LLMs on; Intel
     # Macs fall through to the CPU path.
-    if "arm" not in arch and "aarch64" not in arch:
+    if _canonical_cpu_arch(arch) != "arm64":
         return None
 
     # Chip name, e.g. "Apple M4 Max" — carries the Pro/Max/Ultra variant that
@@ -503,6 +503,25 @@ def _get_cpu_count():
     return os.cpu_count() or 1
 
 
+def _canonical_cpu_arch(value):
+    """Normalize a raw machine/architecture string to x86_64, x86, arm64 or arm."""
+    name = str(value or "").lower().strip().replace("-", "_")
+    if name.startswith("armv"):
+        return "arm"  # every "armv*" variant collapses to plain "arm"
+    aliases = {
+        "x86_64": "x86_64", "amd64": "x86_64", "x64": "x86_64",
+        "i386": "x86", "i686": "x86", "x86": "x86",
+        "arm64": "arm64", "aarch64": "arm64", "arm": "arm",
+    }
+    return aliases.get(name, name)  # unknown names pass through, normalized
+
+
+def _get_cpu_arch():
+    """Return the canonical CPU arch: remote hosts via ``uname -m``, else platform.machine()."""
+    raw = (_run(["uname", "-m"]) or "") if _remote_host else platform.machine()
+    return _canonical_cpu_arch(raw)
+
+
 def _powershell_exe():
     """Pick the best PowerShell executable for LOCAL execution: prefer pwsh
     (PowerShell 7+), fall back to Windows PowerShell 5.1. Returns an absolute
@@ -528,6 +547,7 @@ def _detect_windows():
         $r.cpu_name = $cpu.Name
         $r.cpu_cores = (Get-CimInstance Win32_Processor | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum
         $r.arch = $cpu.AddressWidth
+        $r.cpu_arch = if ($env:PROCESSOR_ARCHITEW6432) { $env:PROCESSOR_ARCHITEW6432 } else { $env:PROCESSOR_ARCHITECTURE }
         # GPU detection via nvidia-smi (fastest) or WMI fallback
         try { 
             $nv = nvidia-smi --query-gpu=memory.total,name --format=csv,noheader,nounits 2>$null
@@ -599,6 +619,7 @@ def _detect_windows():
             "available_ram_gb": d.get("avail_gb", 0),
             "cpu_cores": _as_int(d.get("cpu_cores"), 1),
             "cpu_name": _cpu_name,
+            "cpu_arch": _canonical_cpu_arch(d.get("cpu_arch")),
             "has_gpu": bool(d.get("gpu_name")),
             "gpu_name": d.get("gpu_name"),
             "gpu_vram_gb": d.get("gpu_vram_gb"),
@@ -794,6 +815,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
     available_ram = round(_get_available_ram_gb(), 1)
     cpu_cores = _get_cpu_count()
     cpu_name = _get_cpu_name()
+    cpu_arch = _get_cpu_arch()
 
     gpu_info = _detect_apple_silicon() or _detect_nvidia() or _detect_amd()
 
@@ -803,6 +825,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
             "available_ram_gb": available_ram,
             "cpu_cores": cpu_cores,
             "cpu_name": cpu_name,
+            "cpu_arch": cpu_arch,
             "has_gpu": True,
             "gpu_name": gpu_info["gpu_name"],
             "gpu_vram_gb": gpu_info["gpu_vram_gb"],
@@ -817,17 +840,13 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
             "unified_memory": gpu_info.get("unified_memory", False),
         }
     else:
-        if _remote_host:
-            arch_out = _run(["uname", "-m"]) or ""
-        else:
-            import platform as _platform
-            arch_out = _platform.machine().lower()
-        backend = "cpu_arm" if "aarch64" in arch_out or "arm" in arch_out else "cpu_x86"
+        backend = "cpu_arm" if cpu_arch == "arm64" else "cpu_x86"
         result = {
             "total_ram_gb": total_ram,
             "available_ram_gb": available_ram,
             "cpu_cores": cpu_cores,
             "cpu_name": cpu_name,
+            "cpu_arch": cpu_arch,
             "has_gpu": False,
             "gpu_name": None,
             "gpu_vram_gb": None,
diff --git a/tests/test_hwfit_cpu_arch_detection.py b/tests/test_hwfit_cpu_arch_detection.py
new file mode 100644
index 000000000..b2b6fba8f
--- /dev/null
+++ b/tests/test_hwfit_cpu_arch_detection.py
@@ -0,0 +1,55 @@
+"""CPU architecture normalization for HW Fit hardware detection."""
+
+import pytest
+
+from services.hwfit import hardware
+
+
+@pytest.fixture(autouse=True)
+def _clear_hwfit_cache(monkeypatch):
+    hardware._cache_by_host.clear()  # drop anything cached before this test
+    monkeypatch.setattr(hardware, "_remote_host", None)  # falsy: probe via platform.machine()
+    monkeypatch.setattr(hardware, "_remote_platform", None)
+    monkeypatch.setattr(hardware, "_is_containerized", lambda: False)  # always answers False
+    yield  # the test body runs here
+    hardware._cache_by_host.clear()  # leave nothing cached for later tests
+
+
+def _stub_common_probe(monkeypatch, machine):
+    """Pin platform.machine() to ``machine`` and make the RAM/CPU/Apple/AMD probes constant."""
+    canned = {"_get_ram_gb": 64.0, "_get_available_ram_gb": 48.0, "_get_cpu_count": 16,
+              "_get_cpu_name": "Test CPU", "_detect_apple_silicon": None, "_detect_amd": None}
+    monkeypatch.setattr(hardware.platform, "machine", lambda: machine)
+    for attr, value in canned.items():
+        # Bind value per iteration; a plain closure would only ever see the last one.
+        monkeypatch.setattr(hardware, attr, lambda value=value: value)
+
+
+def test_detect_system_reports_cpu_arch_for_gpu_backends(monkeypatch):
+    """A CUDA-backed aarch64 host must still report cpu_arch as canonical "arm64"."""
+    _stub_common_probe(monkeypatch, "aarch64")
+    nvidia_probe = dict(
+        gpu_name="NVIDIA GB10",
+        gpu_vram_gb=64.0,
+        gpu_count=1,
+        gpus=[],
+        gpu_groups=[],
+        homogeneous=True,
+        backend="cuda",
+    )
+    monkeypatch.setattr(hardware, "_detect_nvidia", lambda: dict(nvidia_probe))
+
+    detected = hardware.detect_system(fresh=True)
+    assert detected["backend"] == "cuda"
+    assert detected["cpu_arch"] == "arm64"
+
+
+def test_detect_system_keeps_32_bit_arm_on_conservative_cpu_backend(monkeypatch):
+    """An armv7l host reports cpu_arch "arm" and stays on the cpu_x86 backend."""
+    monkeypatch.setattr(hardware, "_detect_nvidia", lambda: None)
+    _stub_common_probe(monkeypatch, "armv7l")
+
+    detected = hardware.detect_system(fresh=True)
+
+    observed = (detected["cpu_arch"], detected["backend"])
+    assert observed == ("arm", "cpu_x86")
diff --git a/tests/test_hwfit_cpu_only_fallback.py b/tests/test_hwfit_cpu_only_fallback.py
index 765f99051..826684fca 100644
--- a/tests/test_hwfit_cpu_only_fallback.py
+++ b/tests/test_hwfit_cpu_only_fallback.py
@@ -47,6 +47,12 @@ ARM64_SYSTEM = {
     "gpu_vram_gb": 0,
 }
 
+ARM32_SYSTEM = {
+    "backend": "arm",  # bare "arm" alias, as opposed to "arm64"/"aarch64"
+    "gpu_name": None,
+    "gpu_vram_gb": 0,
+}
+
 AARCH64_SYSTEM = {
     "backend": "aarch64",
     "gpu_name": None,
@@ -79,6 +85,16 @@ def test_cpu_only_on_metal_apple_silicon_uses_cpu_arm_fallback():
     assert metal_tps > 0
 
 
+def test_cpu_only_on_gpu_backend_uses_detected_arm64_cpu_arch():
+    """cpu_only on CUDA_SYSTEM with cpu_arch="aarch64" must match the CPU_ARM_SYSTEM estimate."""
+    arm64_gpu_host = {**CUDA_SYSTEM, "cpu_arch": "aarch64", "cpu_name": "Ampere Altra"}
+    observed_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", arm64_gpu_host)
+    expected_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_ARM_SYSTEM)
+
+    assert observed_tps == pytest.approx(expected_tps, rel=1e-9, abs=1e-9)
+    assert observed_tps > 0
+
+
 @pytest.mark.parametrize(
     "arm_alias_system",
     [ARM64_SYSTEM, AARCH64_SYSTEM, CPU_ARM_SYSTEM],
@@ -93,6 +109,15 @@ def test_cpu_only_preserves_arm_backends(arm_alias_system):
     assert alias_tps > 0
 
 
+def test_cpu_only_does_not_treat_plain_arm_as_arm64_fallback():
+    """A bare "arm" backend must get the same cpu_only estimate as CPU_X86_SYSTEM."""
+    plain_arm_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", ARM32_SYSTEM)
+    baseline_tps = _estimate_speed(DENSE_MODEL, QUANT, "cpu_only", CPU_X86_SYSTEM)
+
+    assert plain_arm_tps == pytest.approx(baseline_tps, rel=1e-9, abs=1e-9)
+    assert plain_arm_tps > 0
+
+
 def test_cpu_only_preserves_known_cpu_backends():
     """Known CPU backends should be preserved, not rewritten to cpu_x86."""
     for system in (CPU_X86_SYSTEM, CPU_ARM_SYSTEM):
diff --git a/tests/test_hwfit_macos.py b/tests/test_hwfit_macos.py
index a979d14eb..f81cc9b38 100644
--- a/tests/test_hwfit_macos.py
+++ b/tests/test_hwfit_macos.py
@@ -165,6 +165,15 @@ def test_intel_mac_skipped(monkeypatch):
     assert hardware._detect_apple_silicon() is None
 
 
+def test_plain_arm_mac_skipped(monkeypatch):
+    """Darwin reporting an "armv7l" machine must not be detected as Apple Silicon."""
+    monkeypatch.setattr(hardware, "_run", _fake_sysctl())
+    monkeypatch.setattr(hardware.platform, "machine", lambda: "armv7l")
+    monkeypatch.setattr(hardware.platform, "system", lambda: "Darwin")
+    monkeypatch.setattr(hardware, "_remote_host", None)
+    assert hardware._detect_apple_silicon() is None
+
+
 def test_detect_system_propagates_unified_memory(monkeypatch):
     """The unified_memory flag set by GPU detection must survive into the
     system dict so the API and UI can report it (it was being dropped)."""