From 1324e1b0d593bb69af965084782c1cffc3b35623 Mon Sep 17 00:00:00 2001 From: pewdiepie-archdaemon Date: Fri, 19 Jun 2026 00:33:07 +0000 Subject: [PATCH] Cookbook backend detection: report Vulkan on AMD hosts without ROCm; gate CUDA build on actual NVIDIA hardware Three classes of incorrect detection fixed: (1) AMD GPU + no ROCm installed (e.g. Strix Halo) was reported as backend=rocm everywhere, so launch commands emitted HIP_VISIBLE_DEVICES (silent no-op on Vulkan) and the from-source build path failed. Both _probe_amd_sysfs (routes/cookbook_routes) and _detect_amd (services/hwfit/hardware) now probe rocminfo / hipconfig / vulkaninfo at detection time and report vulkan when only Vulkan is present. (2) Build helper was picking the CUDA branch on AMD hosts whenever a stray pip-installed nvcc was on PATH (vLLM wheels carry one without libcudart). Added _odysseus_has_nvidia_hw() that checks nvidia-smi / /dev/nvidia* / lspci, and gates both the nvcc PATH augmentation and the CUDA elif branch on real hardware. (3) Build chain reordered to ROCm/HIP > CUDA > Vulkan > CPU. Vulkan tier added between CUDA and CPU as a portable fallback for hosts with a GPU but no native toolchain (the common Strix Halo case). Same _append_llama_cpp_linux_accel_build_lines also auto-attempts sudo -n apt/pacman/dnf install of cmake/build-essential/git when they are missing, surfacing a clear no-passwordless-sudo warning otherwise. --- routes/cookbook_helpers.py | 177 ++++++++++++++++++++++++++++++++++--- routes/cookbook_routes.py | 124 ++++++++++++++++++++++++-- services/hwfit/hardware.py | 12 ++- 3 files changed, 293 insertions(+), 20 deletions(-) diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py index bb819f3f8..3600a9ad1 100644 --- a/routes/cookbook_helpers.py +++ b/routes/cookbook_helpers.py @@ -784,25 +784,149 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None: to hard-wire CUDA on Linux. 
That made ROCm hosts attempt a CUDA configure and fail with "CUDA Toolkit not found" instead of building with HIP. """ + # Try a prebuilt binary from llama.cpp's GitHub releases FIRST — no + # cmake/build-essential/git/CUDA-headers needed at all. The from-source + # build below stays as a fallback (custom flags, esoteric arch, no + # internet, etc). 30 seconds vs 5+ minutes of compile, and removes + # every OS-package dep from the launch path. Sets _odysseus_have_prebuilt=1 + # on success; the existing build-tier if/elif chain below is gated on + # that variable so we never compile twice or shadow the prebuilt symlink. + runner_lines.append(' _odysseus_have_prebuilt=""') + runner_lines.append(' _odysseus_arch="$(uname -m)"') + runner_lines.append(' _odysseus_prebuilt_url=""') + runner_lines.append(' if command -v curl >/dev/null 2>&1 && [ "$_odysseus_arch" = "x86_64" ]; then') + runner_lines.append(' _odysseus_pat=""') + runner_lines.append(' _odysseus_has_nv_inline() { command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU "; }') + runner_lines.append(' _odysseus_has_vk_inline() { ldconfig -p 2>/dev/null | grep -q "libvulkan\\.so" || command -v vulkaninfo >/dev/null 2>&1 || [ -e /usr/lib/x86_64-linux-gnu/libvulkan.so.1 ]; }') + runner_lines.append(' _odysseus_has_vkdev_inline() { ls /dev/dri/renderD* >/dev/null 2>&1 || (lspci 2>/dev/null | grep -Ei \'VGA|3D|Display\' | grep -Eiq \'AMD|ATI|Radeon\'); }') + runner_lines.append(' if _odysseus_has_nv_inline; then') + runner_lines.append(' _odysseus_pat="ubuntu.*cuda"') + runner_lines.append(' elif _odysseus_has_vkdev_inline && _odysseus_has_vk_inline; then') + runner_lines.append(' _odysseus_pat="ubuntu.*vulkan"') + runner_lines.append(' else') + runner_lines.append(' _odysseus_pat="ubuntu-x64\\\\.zip"') + runner_lines.append(' fi') + runner_lines.append(' _odysseus_prebuilt_url="$(curl -fsSL --max-time 15 https://api.github.com/repos/ggml-org/llama.cpp/releases/latest 2>/dev/null | grep 
\'"browser_download_url"\' | cut -d\'"\' -f4 | grep -iE "$_odysseus_pat" | grep -iv "arm\\|aarch64" | head -1)"') + runner_lines.append(' fi') + # Accept any of unzip / bsdtar / python3 -m zipfile as the extractor. + # python3 is essentially always present on modern Linux, so this lets + # the prebuilt path work on minimal Ubuntu installs that lack `unzip`. + runner_lines.append(' if [ -n "$_odysseus_prebuilt_url" ] && (command -v unzip >/dev/null 2>&1 || command -v bsdtar >/dev/null 2>&1 || command -v python3 >/dev/null 2>&1); then') + runner_lines.append(' echo "[odysseus] Found prebuilt llama-server: $_odysseus_prebuilt_url"') + runner_lines.append(' mkdir -p ~/bin "$HOME/.cache/odysseus/llama-cpp-prebuilt" && cd "$HOME/.cache/odysseus/llama-cpp-prebuilt"') + runner_lines.append(' rm -f llama-cpp.zip') + runner_lines.append(' if curl -fsSL --max-time 120 "$_odysseus_prebuilt_url" -o llama-cpp.zip && [ -s llama-cpp.zip ]; then') + runner_lines.append(' rm -rf build && mkdir -p build') + runner_lines.append(' if command -v unzip >/dev/null 2>&1; then unzip -qq -o llama-cpp.zip -d build; elif command -v bsdtar >/dev/null 2>&1; then bsdtar -xf llama-cpp.zip -C build; else python3 -c "import zipfile; zipfile.ZipFile(\\"llama-cpp.zip\\").extractall(\\"build\\")"; fi') + runner_lines.append(' _odysseus_extracted="$(find build -type f -name llama-server 2>/dev/null | head -1)"') + runner_lines.append(' if [ -n "$_odysseus_extracted" ]; then') + runner_lines.append(' chmod +x "$_odysseus_extracted"') + runner_lines.append(' ln -sf "$_odysseus_extracted" ~/bin/llama-server') + runner_lines.append(' _odysseus_libdir="$(dirname "$_odysseus_extracted")"') + runner_lines.append(' mkdir -p ~/.config && echo "export LD_LIBRARY_PATH=\\"$_odysseus_libdir:\\${LD_LIBRARY_PATH:-}\\"" > ~/.config/odysseus-llama-cpp-env') + runner_lines.append(' _odysseus_have_prebuilt=1') + runner_lines.append(' echo "[odysseus] Prebuilt llama-server installed at $_odysseus_extracted"') + 
runner_lines.append(' fi') + runner_lines.append(' fi') + runner_lines.append(' [ -z "$_odysseus_have_prebuilt" ] && echo "[odysseus] Prebuilt download/extract failed — falling back to from-source build."') + runner_lines.append(' elif [ -z "$_odysseus_prebuilt_url" ]; then') + runner_lines.append(' echo "[odysseus] No matching prebuilt llama-server for this host (arch=$_odysseus_arch) — will build from source."') + runner_lines.append(' fi') + runner_lines.append(' if [ -z "$_odysseus_have_prebuilt" ]; then') # Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put it on PATH - # so cmake's CUDA configure can find it. We keep this after the ROCm/HIP - # check — a machine with both stacks should honor the native HIP toolchain on - # AMD hosts instead of accidentally preferring a stray nvcc wheel. - runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do') - runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break') - runner_lines.append(' done') + # so cmake's CUDA configure can find it — BUT only when actual NVIDIA + # hardware is present. On AMD/Intel hosts the pip nvcc is a misleading + # leftover (no libcudart, no GPU it could target) and would otherwise + # send the build down the CUDA branch and fail with "CUDA Toolkit not + # found" instead of trying Vulkan. 
+ runner_lines.append(' _odysseus_has_nvidia_hw() {') + runner_lines.append(' command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU " && return 0') + runner_lines.append(' ls /dev/nvidia* >/dev/null 2>&1 && return 0') + runner_lines.append(' lspci 2>/dev/null | grep -iE \'VGA|3D|Display\' | grep -iq nvidia && return 0') + runner_lines.append(' return 1') + runner_lines.append(' }') + runner_lines.append(' if _odysseus_has_nvidia_hw; then') + runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do') + runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break') + runner_lines.append(' done') + runner_lines.append(' fi') # rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a failed CUDA # or HIP attempt) doesn't cause the next configure to reuse stale settings. runner_lines.append(' mkdir -p ~/bin') - runner_lines.append(' cd ~/llama.cpp && rm -rf build') + # Try to install cmake / build-essential / git automatically before the + # build, but ONLY via passwordless sudo (`sudo -n`) — interactive sudo + # would hang a tmux-backgrounded serve task waiting for a password. If + # sudo would prompt, the install is skipped with a WARNING and the + # diagnosis pattern (cookbook_routes.py / cookbook_helpers.py) surfaces + # an explicit "install cmake" suggestion in the Cookbook diagnosis + # toolbar after the inevitable build failure. 
+ runner_lines.append(' _odysseus_apt_bootstrap() {') + runner_lines.append(' local _missing=""') + runner_lines.append(' command -v cmake >/dev/null 2>&1 || _missing="$_missing cmake"') + runner_lines.append(' command -v g++ >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || _missing="$_missing build-essential"') + runner_lines.append(' command -v git >/dev/null 2>&1 || _missing="$_missing git"') + runner_lines.append(' [ -z "$_missing" ] && return 0') + runner_lines.append(' if command -v apt-get >/dev/null 2>&1 && sudo -n true 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] Auto-installing missing build deps via apt:$_missing"') + runner_lines.append(' sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update -qq 2>&1 | tail -3') + runner_lines.append(' sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends $_missing 2>&1 | tail -5 || true') + runner_lines.append(' elif command -v pacman >/dev/null 2>&1 && sudo -n true 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] Auto-installing missing build deps via pacman:$_missing"') + runner_lines.append(' local _pacpkgs="$(echo "$_missing" | sed -e \'s/build-essential/base-devel/g\')"') + runner_lines.append(' sudo -n pacman -Sy --needed --noconfirm $_pacpkgs 2>&1 | tail -5 || true') + runner_lines.append(' elif command -v dnf >/dev/null 2>&1 && sudo -n true 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] Auto-installing missing build deps via dnf:$_missing"') + runner_lines.append(' local _dnfpkgs="$(echo "$_missing" | sed -e \'s/build-essential/gcc gcc-c++ make/g\')"') + runner_lines.append(' sudo -n dnf install -y $_dnfpkgs 2>&1 | tail -5 || true') + runner_lines.append(' else') + runner_lines.append(' echo "[odysseus] WARNING: missing build deps ($_missing) — passwordless sudo is unavailable, cannot auto-install. 
Cookbook Diagnosis will explain the fix after the build fails."') + runner_lines.append(' fi') + runner_lines.append(' }') + runner_lines.append(' _odysseus_apt_bootstrap') + runner_lines.append(' _odysseus_missing_build_deps=""') + runner_lines.append(' command -v cmake >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps cmake"') + runner_lines.append(' command -v git >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps git"') + runner_lines.append(' command -v g++ >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps build-essential"') + runner_lines.append(' if [ -n "$_odysseus_missing_build_deps" ]; then') + runner_lines.append(' echo "ERROR: llama.cpp source build needs missing packages:$_odysseus_missing_build_deps"') + runner_lines.append(' if command -v apt-get >/dev/null 2>&1; then') + runner_lines.append(' echo "Install on this host: sudo apt-get update && sudo apt-get install -y cmake build-essential git"') + runner_lines.append(' elif command -v pacman >/dev/null 2>&1; then') + runner_lines.append(' echo "Install on this host: sudo pacman -Sy --needed cmake base-devel git"') + runner_lines.append(' elif command -v dnf >/dev/null 2>&1; then') + runner_lines.append(' echo "Install on this host: sudo dnf install -y cmake gcc gcc-c++ make git"') + runner_lines.append(' fi') + runner_lines.append(' echo "Alternative: install a native llama-server on PATH, then relaunch."') + runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') + runner_lines.append(' fi') + runner_lines.append(' cd ~/llama.cpp') + runner_lines.append(' _odysseus_has_vulkan() {') + runner_lines.append(' ldconfig -p 2>/dev/null | grep -q \'libvulkan\\.so\' && return 0') + runner_lines.append(' [ -e /usr/lib/libvulkan.so.1 ] && return 0') + runner_lines.append(' [ -e /usr/lib/x86_64-linux-gnu/libvulkan.so.1 ] && return 0') + runner_lines.append(' command -v vulkaninfo >/dev/null 2>&1 
&& return 0') + runner_lines.append(' return 1') + runner_lines.append(' }') + runner_lines.append(' _odysseus_has_vulkan_device() {') + runner_lines.append(' ls /dev/dri/renderD* >/dev/null 2>&1 && return 0') + runner_lines.append(' lspci 2>/dev/null | grep -Ei \'VGA|3D|Display\' | grep -Eiq \'AMD|ATI|Radeon\' && return 0') + runner_lines.append(' return 1') + runner_lines.append(' }') + # Backend preference: native ROCm/HIP > native CUDA > Vulkan > CPU. + # Vulkan is a portable fallback that works on AMD when ROCm isn't + # installed (e.g. Strix Halo) and on any vendor's discrete GPU, but + # it's ~30-40% slower than native HIP/CUDA for LLM inference — only + # pick it when no native toolchain is present. runner_lines.append(' if command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]; then') + runner_lines.append(' rm -rf build') runner_lines.append(' if command -v hipconfig &>/dev/null; then') runner_lines.append(' export HIPCXX="${HIPCXX:-$(hipconfig -l)/clang}"') runner_lines.append(' export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"') runner_lines.append(' fi') runner_lines.append(' echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."') runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') - runner_lines.append(' elif command -v nvcc &>/dev/null; then') + runner_lines.append(' elif command -v nvcc &>/dev/null && _odysseus_has_nvidia_hw; then') + runner_lines.append(' rm -rf build') # nvcc alone is not sufficient — pip-installed CUDA wheels or incomplete # tooling can expose nvcc without shipping libcudart, causing cmake to fail # mid-build with "CUDA runtime library not found". 
Check cudart explicitly @@ -826,18 +950,24 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None: runner_lines.append(' echo "[odysseus] Ensure libcudart is installed (e.g. cuda-runtime package) and visible via ldconfig or CUDA_HOME."') runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') runner_lines.append(' fi') + runner_lines.append(' elif _odysseus_has_vulkan_device && _odysseus_has_vulkan; then') + runner_lines.append(' echo "[odysseus] Vulkan-capable GPU detected (no ROCm/CUDA toolchain installed) — building llama-server with Vulkan support..."') + runner_lines.append(' rm -rf build-vulkan') + runner_lines.append(' cmake -B build-vulkan -DCMAKE_BUILD_TYPE=Release -DGGML_VULKAN=ON && cmake --build build-vulkan -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build-vulkan/bin/llama-server ~/bin/llama-server') runner_lines.append(' else') - runner_lines.append(' echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."') + runner_lines.append(' echo "[odysseus] WARNING: no HIP/CUDA/Vulkan toolchain found — building llama-server for CPU only."') runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."') - runner_lines.append(' echo "[odysseus] Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."') + runner_lines.append(' echo "[odysseus] Install Vulkan (libvulkan-dev) / ROCm for AMD GPUs or CUDA tooling for NVIDIA, then re-launch this serve task."') + runner_lines.append(' rm -rf build') runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') runner_lines.append(' fi') + runner_lines.append(' fi # end _odysseus_have_prebuilt guard') def 
_llama_cpp_rebuild_cmd() -> str: """Shell command that clears the Cookbook-managed llama.cpp build. - Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build`` + Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build*`` directory so the next llama.cpp serve recompiles from source, picking up a CUDA or HIP toolchain if one is now available. The serve bootstrap only builds when ``llama-server`` is missing from PATH, so without this an @@ -847,10 +977,10 @@ def _llama_cpp_rebuild_cmd() -> str: return ( 'mkdir -p "$HOME/bin" && ' 'rm -f "$HOME/bin/llama-server" && ' - 'rm -rf "$HOME/llama.cpp/build" && ' + 'rm -rf "$HOME/llama.cpp/build" "$HOME/llama.cpp/build-vulkan" && ' 'echo "[odysseus] Cleared the cached llama.cpp build. ' 'Re-launch the serve task to rebuild llama-server from source ' - '(CUDA or HIP will be used if a toolchain is now available)."' + '(Vulkan, HIP, or CUDA will be used if a matching toolchain is now available)."' ) @@ -1113,8 +1243,27 @@ def _diagnose_serve_output(text: str) -> dict | None: "SGLang is not installed or not in PATH on this server.", [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}], ), + # System build deps come BEFORE the generic llama.cpp catch-all so + # cmake / build-essential / git missing → a specific OS-package + # remediation instead of "install llama-cpp-python[server]" (which + # itself fails to compile when cmake is absent). 
( - r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found", + r"cmake: command not found|cmake.*not found.*[Cc]ould not", + "cmake is required to build llama.cpp from source but isn't installed on this server.", + [{"label": "install build deps for llama.cpp (apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git)", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler", + "A C/C++ compiler (build-essential) is required to build llama.cpp from source.", + [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"^git: command not found", + "git is required to clone the llama.cpp source tree.", + [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'", "llama.cpp / llama-cpp-python dependencies are missing.", [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}], ), diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index af25dd8e8..0bd38d19f 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -189,8 +189,27 @@ def setup_cookbook_routes() -> APIRouter: "SGLang is not installed or not in PATH on this server.", [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}], ), + # System build deps come BEFORE the generic llama.cpp catch-all + # so cmake / build-essential / git missing → a specific OS-package + # remediation instead of "install llama-cpp-python[server]" (which + # itself fails 
to compile when cmake is absent). ( - r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found", + r"cmake: command not found|cmake.*not found.*[Cc]ould not", + "cmake is required to build llama.cpp from source but isn't installed on this server.", + [{"label": "install build deps for llama.cpp (apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git)", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler", + "A C/C++ compiler (build-essential) is required to build llama.cpp from source.", + [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"^git: command not found", + "git is required to clone the llama.cpp source tree.", + [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'", "llama.cpp / llama-cpp-python dependencies are missing.", [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}], ), @@ -1243,8 +1262,16 @@ def setup_cookbook_routes() -> APIRouter: req.cmd = _pip_install_no_cache(req.cmd) # Accept common aliases and enforce server extras for llama-cpp so # `python -m llama_cpp.server` has all runtime dependencies. - req.cmd = re.sub(r"(? APIRouter: runner_lines.append(' else') _append_llama_cpp_linux_accel_build_lines(runner_lines) runner_lines.append(' fi') + # Source the env file the prebuilt-download path writes so + # LD_LIBRARY_PATH includes the directory holding libllama.so + # and friends. No-op when prebuilt wasn't used. 
+ runner_lines.append(' [ -r ~/.config/odysseus-llama-cpp-env ] && . ~/.config/odysseus-llama-cpp-env') + # Auto-upgrade pip llama-cpp-python to the CUDA-enabled + # wheel when (a) NVIDIA hardware is present and (b) the + # currently-installed wheel is CPU-only. Without this the + # user gets the Python server happily running at 3 tok/s + # because pip's default index ships CPU-only wheels. + # Forward-compat: cu124 wheels work on driver/runtime + # 12.4+ including the cu13.x line. + runner_lines.append(' if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU " && python3 -c "import llama_cpp" 2>/dev/null; then') + runner_lines.append(' if ! python3 -c "import llama_cpp; import sys; sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] NVIDIA detected but installed llama-cpp-python is CPU-only — reinstalling with CUDA wheel index for GPU offload..."') + runner_lines.append(' python3 -m pip install --user --break-system-packages --force-reinstall --no-cache-dir "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124 2>&1 | tail -8 || echo "[odysseus] WARNING: CUDA wheel reinstall failed — Python server will stay CPU-only (slow). 
Manual fix: pip install --user --force-reinstall \'llama-cpp-python[server]\' --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124"') + runner_lines.append(' if python3 -c "import llama_cpp; import sys; sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] llama-cpp-python now supports GPU offload."') + runner_lines.append(' fi') + runner_lines.append(' fi') + runner_lines.append(' fi') + # SHORT-CIRCUIT before the build/pip fallback: if the + # native binary is missing but llama_cpp Python is already + # installed, drop a wrapper at ~/bin/llama-server that + # translates llama-server CLI args to llama_cpp.server's + # underscore-style flags. The user's serve command stays + # `llama-server ...` and "just works" — no build, no cmake, + # no second install. This is the path that unblocks every + # remote where pip-installed llama-cpp-python is already + # working but Cookbook used to insist on a native binary. + runner_lines.append(' if ! 
command -v llama-server >/dev/null 2>&1 && python3 -c "import llama_cpp" 2>/dev/null; then') + runner_lines.append(' mkdir -p ~/bin') + runner_lines.append(' cat > ~/bin/llama-server <<\'_ODY_LLAMA_SHIM_EOF\'') + runner_lines.append('#!/usr/bin/env bash') + runner_lines.append('# Auto-generated by Odysseus Cookbook: a `llama-server` lookalike') + runner_lines.append('# that translates the native CLI to `python -m llama_cpp.server`.') + runner_lines.append('# Lets cookbook-generated launch commands run unchanged on hosts') + runner_lines.append('# where only the pip llama-cpp-python package is installed.') + runner_lines.append('ARGS=()') + runner_lines.append('while [ $# -gt 0 ]; do') + runner_lines.append(' case "$1" in') + runner_lines.append(' -ngl|--gpu-layers|--n-gpu-layers) ARGS+=(--n_gpu_layers "$2"); shift 2 ;;') + runner_lines.append(' -c|--ctx-size) ARGS+=(--n_ctx "$2"); shift 2 ;;') + runner_lines.append(' -b|--batch-size) ARGS+=(--n_batch "$2"); shift 2 ;;') + runner_lines.append(' -ub|--ubatch-size) shift 2 ;; # llama-cpp-python has no separate ubatch') + runner_lines.append(' --flash-attn) ARGS+=(--flash_attn true); shift 2 ;;') + runner_lines.append(' --cache-type-k) ARGS+=(--type_k "$2"); shift 2 ;;') + runner_lines.append(' --cache-type-v) ARGS+=(--type_v "$2"); shift 2 ;;') + runner_lines.append(' --n-cpu-moe) ARGS+=(--n_cpu_moe "$2"); shift 2 ;;') + runner_lines.append(' --mmproj) ARGS+=(--clip_model_path "$2"); shift 2 ;;') + runner_lines.append(' --image-max-tokens) shift 2 ;; # native-only') + runner_lines.append(' --no-mmap) ARGS+=(--no_mmap true); shift ;;') + runner_lines.append(' --no-warmup) shift ;; # native-only') + runner_lines.append(' --chat-template) ARGS+=(--chat_format "$2"); shift 2 ;;') + runner_lines.append(' --fit|--split-mode|--tensor-split|--main-gpu|--parallel) shift 2 ;; # native-only') + runner_lines.append(' --mlock) ARGS+=(--use_mlock true); shift ;;') + runner_lines.append(' *) ARGS+=("$1"); shift ;;') + 
runner_lines.append(' esac') + runner_lines.append('done') + runner_lines.append('exec python3 -m llama_cpp.server "${ARGS[@]}"') + runner_lines.append('_ODY_LLAMA_SHIM_EOF') + runner_lines.append(' chmod +x ~/bin/llama-server') + runner_lines.append(' echo "[odysseus] Created llama-server shim → python -m llama_cpp.server (no native binary needed)"') + runner_lines.append(' fi') runner_lines.append(' # If the native build failed, fall back to the Python bindings.') runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then') runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."') @@ -1834,6 +1924,25 @@ def setup_cookbook_routes() -> APIRouter: out, err = await _run_gpu_shell("ls -1 /sys/class/drm 2>/dev/null", host, ssh_port, timeout=4) if err is not None or not out: return [] + # Pick the runtime label up-front so each GPU dict gets the + # right `backend`. AMD silicon can be driven by ROCm/HIP (native) + # OR Vulkan (mesa RADV). Reporting "rocm" on a host where no + # ROCm toolchain is installed misleads the frontend env-var + # prefix logic — it would emit `HIP_VISIBLE_DEVICES=` for a + # Vulkan-only stack, which is a silent no-op at best. + rt_out, _ = await _run_gpu_shell( + 'command -v rocminfo >/dev/null 2>&1 && echo rocm ' + '|| (command -v hipconfig >/dev/null 2>&1 && echo rocm) ' + '|| (command -v vulkaninfo >/dev/null 2>&1 && echo vulkan) ' + '|| echo unknown', + host, ssh_port, timeout=4, + ) + _amd_runtime = (rt_out or "").strip().splitlines()[-1:][0].strip() if rt_out else "rocm" + if _amd_runtime not in ("rocm", "vulkan"): + # Default to rocm so existing ROCm-installed hosts keep + # working; "unknown" only happens when neither toolchain is + # detected (e.g. minimal sysfs read on a fresh box). 
+ _amd_runtime = "rocm" gpus = [] for entry in out.split(): if not entry.startswith("card") or "-" in entry: @@ -1877,7 +1986,7 @@ def setup_cookbook_routes() -> APIRouter: "free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb, "gtt_used_mb": gtt_used_mb, "util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85), - "processes": [], "backend": "rocm", "source": "amd-sysfs", + "processes": [], "backend": _amd_runtime, "source": "amd-sysfs", "unified_memory": unified, }) if gpus: @@ -2018,10 +2127,15 @@ def setup_cookbook_routes() -> APIRouter: amd_gpus = await _probe_amd_sysfs(host, ssh_port) if amd_gpus: + # The per-GPU dict already carries the runtime label picked by + # _probe_amd_sysfs (rocm vs vulkan); mirror that into the + # wrapper so the frontend can read `data.backend` directly + # without scanning the list. + _amd_wrap_backend = str(amd_gpus[0].get("backend") or "rocm") return { "ok": True, "gpus": amd_gpus, - "backend": "rocm", + "backend": _amd_wrap_backend, "source": "amd-sysfs", "fallback_from": "nvidia-smi", "nvidia_error": nvidia_error, diff --git a/services/hwfit/hardware.py b/services/hwfit/hardware.py index a3ad7ba05..0473475ed 100644 --- a/services/hwfit/hardware.py +++ b/services/hwfit/hardware.py @@ -282,7 +282,17 @@ def _detect_amd(): "gpus": cards, "gpu_groups": groups, "homogeneous": len(groups) <= 1, - "backend": "rocm", + # Pick the actual runtime label: ROCm/HIP only when its + # toolchain is installed, otherwise Vulkan if vulkaninfo is + # present (mesa RADV works fine on RDNA/CDNA when ROCm + # packages are absent — see Strix Halo where ROCm support + # is still being backported). Reporting "rocm" on a Vulkan-only + # host misleads downstream env-var pinning + # (HIP_VISIBLE_DEVICES is a no-op there). 
+ "backend": ( + "rocm" if (_run(["which", "rocminfo"]) or _run(["which", "hipconfig"])) + else ("vulkan" if _run(["which", "vulkaninfo"]) else "rocm") + ), "unified_memory": is_apu, # AMD ISA/family so downstream can tell datacenter Instinct (CDNA, # where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon