mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
fix(hwfit): detect NVIDIA GPU on WSL and other minimal-PATH environments (#3306)
The nvidia-smi absolute-path fallback in _detect_nvidia() was gated on _remote_host, so it never ran for local detection. On systems where nvidia-smi is not in the default PATH (e.g. WSL, where it lives in /usr/lib/wsl/lib/), this caused the Cookbook to report 'No GPU' even when nvidia-smi works from an interactive shell.

Two issues fixed:
1. Removed the _remote_host gate so the absolute-path scan runs for local detection too.
2. For local execution, pass arguments as a list instead of a string so subprocess.run() resolves the absolute path correctly. Remote (SSH) execution keeps the string form, which the SSH command builder handles.

Co-authored-by: Bipin Mishra <bipin.mishra@atlascopco.com>
This commit is contained in:
@@ -76,9 +76,10 @@ def _detect_nvidia():
|
|||||||
global _last_gpu_error
|
global _last_gpu_error
|
||||||
_last_gpu_error = None
|
_last_gpu_error = None
|
||||||
out = _run(["nvidia-smi", "--query-gpu=memory.total,name", "--format=csv,noheader,nounits"])
|
out = _run(["nvidia-smi", "--query-gpu=memory.total,name", "--format=csv,noheader,nounits"])
|
||||||
# Remote fallback: a non-interactive SSH shell often has a minimal PATH
|
# Fallback: a non-interactive shell (or WSL) often has a minimal PATH
|
||||||
# that omits where nvidia-smi lives (/usr/bin, /usr/local/cuda/bin), so the
|
# that omits where nvidia-smi lives (/usr/bin, /usr/local/cuda/bin,
|
||||||
# first call silently returns nothing → "No GPU" on hosts that DO have GPUs.
|
# /usr/lib/wsl/lib), so the first call silently returns nothing →
|
||||||
|
# "No GPU" on machines that DO have GPUs.
|
||||||
# Retry through a login shell with the common CUDA bin dirs on PATH.
|
# Retry through a login shell with the common CUDA bin dirs on PATH.
|
||||||
if not out and _remote_host:
|
if not out and _remote_host:
|
||||||
out = _run(
|
out = _run(
|
||||||
@@ -88,9 +89,16 @@ def _detect_nvidia():
|
|||||||
# Last resort: call nvidia-smi by absolute path. Some hosts have a login
|
# Last resort: call nvidia-smi by absolute path. Some hosts have a login
|
||||||
# shell that isn't bash (or a profile that errors), so the bash -lc retry
|
# shell that isn't bash (or a profile that errors), so the bash -lc retry
|
||||||
# above still comes back empty even though the binary is right there.
|
# above still comes back empty even though the binary is right there.
|
||||||
if not out and _remote_host:
|
# Also handles WSL where nvidia-smi lives at /usr/lib/wsl/lib/ — a path
|
||||||
|
# that may not be in the server process's PATH.
|
||||||
|
if not out:
|
||||||
for _p in ("/usr/bin/nvidia-smi", "/usr/local/bin/nvidia-smi", "/usr/local/cuda/bin/nvidia-smi", "/usr/lib/wsl/lib/nvidia-smi"):
|
for _p in ("/usr/bin/nvidia-smi", "/usr/local/bin/nvidia-smi", "/usr/local/cuda/bin/nvidia-smi", "/usr/lib/wsl/lib/nvidia-smi"):
|
||||||
out = _run(f"{_p} --query-gpu=memory.total,name --format=csv,noheader,nounits")
|
# Use list form so subprocess.run (local) resolves the absolute path
|
||||||
|
# correctly instead of treating the whole string as an executable name.
|
||||||
|
if _remote_host:
|
||||||
|
out = _run(f"{_p} --query-gpu=memory.total,name --format=csv,noheader,nounits")
|
||||||
|
else:
|
||||||
|
out = _run([_p, "--query-gpu=memory.total,name", "--format=csv,noheader,nounits"])
|
||||||
if out:
|
if out:
|
||||||
break
|
break
|
||||||
if not out:
|
if not out:
|
||||||
|
|||||||
Reference in New Issue
Block a user