From b22c2b280cf9d1bb85c6de4087818c9df0befd8f Mon Sep 17 00:00:00 2001 From: Bipin Mishra <61102500+bipin-mishra1@users.noreply.github.com> Date: Sun, 7 Jun 2026 21:23:49 +0530 Subject: [PATCH] fix(hwfit): detect NVIDIA GPU on WSL and other minimal-PATH environments (#3306) The nvidia-smi absolute-path fallback in _detect_nvidia() was gated on _remote_host, so it never ran for local detection. On systems where nvidia-smi is not in the default PATH (e.g. WSL: /usr/lib/wsl/lib/), this caused the Cookbook to report 'No GPU' even when nvidia-smi works from an interactive shell. Two issues fixed: 1. Removed the _remote_host gate so the absolute-path scan runs for local detection too. 2. For local execution, pass arguments as a list instead of a string so subprocess.run() resolves the absolute path correctly. Remote (SSH) execution keeps the string form, which the SSH command builder handles. Co-authored-by: Bipin Mishra --- services/hwfit/hardware.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/services/hwfit/hardware.py b/services/hwfit/hardware.py index db48d1842..2b47ffa2a 100644 --- a/services/hwfit/hardware.py +++ b/services/hwfit/hardware.py @@ -76,9 +76,10 @@ def _detect_nvidia(): global _last_gpu_error _last_gpu_error = None out = _run(["nvidia-smi", "--query-gpu=memory.total,name", "--format=csv,noheader,nounits"]) - # Remote fallback: a non-interactive SSH shell often has a minimal PATH - # that omits where nvidia-smi lives (/usr/bin, /usr/local/cuda/bin), so the - # first call silently returns nothing → "No GPU" on hosts that DO have GPUs. + # Fallback: a non-interactive shell (or WSL) often has a minimal PATH + # that omits where nvidia-smi lives (/usr/bin, /usr/local/cuda/bin, + # /usr/lib/wsl/lib), so the first call silently returns nothing → + # "No GPU" on machines that DO have GPUs. # Retry through a login shell with the common CUDA bin dirs on PATH. if not out and _remote_host: out = _run( @@ -88,9 +89,16 @@ def _detect_nvidia(): # Last resort: call nvidia-smi by absolute path. Some hosts have a login # shell that isn't bash (or a profile that errors), so the bash -lc retry # above still comes back empty even though the binary is right there. - if not out and _remote_host: + # Also handles WSL where nvidia-smi lives at /usr/lib/wsl/lib/ — a path + # that may not be in the server process's PATH. + if not out: for _p in ("/usr/bin/nvidia-smi", "/usr/local/bin/nvidia-smi", "/usr/local/cuda/bin/nvidia-smi", "/usr/lib/wsl/lib/nvidia-smi"): - out = _run(f"{_p} --query-gpu=memory.total,name --format=csv,noheader,nounits") + # Use list form so subprocess.run (local) resolves the absolute path + # correctly instead of treating the whole string as an executable name. + if _remote_host: + out = _run(f"{_p} --query-gpu=memory.total,name --format=csv,noheader,nounits") + else: + out = _run([_p, "--query-gpu=memory.total,name", "--format=csv,noheader,nounits"]) if out: break if not out: