From bd2fa82c1eb542a179a7ba7415e241b37ffea277 Mon Sep 17 00:00:00 2001
From: ooovenenoso <120500656+ooovenenoso@users.noreply.github.com>
Date: Tue, 2 Jun 2026 07:59:44 -0400
Subject: [PATCH] Cookbook: prefer ROCm for native llama.cpp bootstrap

Co-authored-by: Kevin <120500656+oooindefatigable@users.noreply.github.com>
---
 routes/cookbook_helpers.py     | 41 +++++++++++++++++++++++++++++++++
 routes/cookbook_routes.py      | 42 +++-------------------------------
 tests/test_cookbook_helpers.py | 22 ++++++++++++++++++
 3 files changed, 66 insertions(+), 39 deletions(-)

diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py
index 9e9e8ce11..4589c5aa2 100644
--- a/routes/cookbook_helpers.py
+++ b/routes/cookbook_helpers.py
@@ -487,8 +487,49 @@ def _append_serve_exit_code_lines(runner_lines: list[str], *, keep_shell_open: b
         runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="; exec "${SHELL:-/bin/bash}"')
     else:
         runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
+        runner_lines.append('exit "$ODYSSEUS_CMD_EXIT"')
 
 
+def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
+    """Append Linux llama.cpp build lines that prefer ROCm/HIP when available.
+
+    Cookbook already detects AMD GPUs elsewhere, but the llama.cpp bootstrap used
+    to hard-wire CUDA on Linux. That made ROCm hosts attempt a CUDA configure and
+    fail with "CUDA Toolkit not found" instead of building with HIP.
+    """
+    # Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put it on PATH
+    # so cmake's CUDA configure can find it. The CUDA branch is only consulted
+    # after the ROCm/HIP check — a machine with both stacks should honor the
+    # native HIP toolchain instead of accidentally preferring a stray nvcc wheel.
+    runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
+    runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
+    runner_lines.append(' done')
+    # rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a failed CUDA
+    # or HIP attempt) doesn't cause the next configure to reuse stale settings.
+    runner_lines.append(' cd ~/llama.cpp && rm -rf build')
+    runner_lines.append(' if command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]; then')
+    runner_lines.append(' if command -v hipconfig &>/dev/null; then')
+    runner_lines.append(' export HIPCXX="${HIPCXX:-$(hipconfig -l)/clang}"')
+    runner_lines.append(' export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"')
+    runner_lines.append(' fi')
+    runner_lines.append(' echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."')
+    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON \\')
+    runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
+    runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append(' elif command -v nvcc &>/dev/null; then')
+    runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
+    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
+    runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
+    runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append(' else')
+    runner_lines.append(' echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."')
+    runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
+    runner_lines.append(' echo "[odysseus] Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."')
+    runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
+    runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
+    runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+    runner_lines.append(' fi')
+
 class ModelDownloadRequest(BaseModel):
     repo_id: str
     include: str | None = None  # glob pattern e.g. "*Q4_K_M*"
diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py
index 786f117d6..abc5927c0 100644
--- a/routes/cookbook_routes.py
+++ b/routes/cookbook_routes.py
@@ -37,8 +37,8 @@ from routes.cookbook_helpers import (
     _validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path, _ps_squote, _bash_squote,
     _validate_serve_cmd, _parse_serve_phase, _safe_env_prefix,
     _local_tooling_path_export, _append_serve_preflight_exit_lines,
-    _append_serve_exit_code_lines, _cached_model_scan_script, _ollama_bind_from_cmd,
-    _pip_install_fallback_chain, ModelDownloadRequest, ServeRequest,
+    _append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
+    _ollama_bind_from_cmd, _pip_install_fallback_chain, ModelDownloadRequest, ServeRequest,
 )
 
 _HF_TOKEN_STATUS_SNIPPET = (
@@ -963,43 +963,7 @@ def setup_cookbook_routes() -> APIRouter:
                 runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
                 runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
                 runner_lines.append(' else')
-                # Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put
-                # it on PATH so cmake's CUDA configure can find it. We check the
-                # same three layouts as entrypoint.sh:
-                # nvidia/cu13 — nvidia-nvcc-cu13
-                # nvidia/cu12 — nvidia-nvcc-cu12
-                # nvidia/cuda_nvcc — nvidia-cuda-nvcc-cu12 (sub-package style)
-                runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
-                runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
-                runner_lines.append(' done')
-                # rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a
-                # failed CUDA attempt) doesn't cause the next configure to reuse
-                # stale settings and silently produce a CPU-only binary.
-                runner_lines.append(' cd ~/llama.cpp && rm -rf build')
-                runner_lines.append(' _ody_has_cuda_runtime=0')
-                runner_lines.append(' if command -v nvcc &>/dev/null; then')
-                runner_lines.append(' for _cudalib in "${CUDA_HOME:-}/lib64"/libcudart.so* "${CUDA_HOME:-}/lib"/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib*/libcudart.so*; do')
-                runner_lines.append(' [ -e "$_cudalib" ] && _ody_has_cuda_runtime=1 && break')
-                runner_lines.append(' done')
-                runner_lines.append(' fi')
-                runner_lines.append(' if command -v nvcc &>/dev/null && [ "$_ody_has_cuda_runtime" = "1" ]; then')
-                runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
-                runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
-                runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
-                runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
-                runner_lines.append(' else')
-                runner_lines.append(' if command -v nvcc &>/dev/null; then')
-                runner_lines.append(' echo "[odysseus] WARNING: nvcc found but CUDA runtime library was not found — building llama-server for CPU only."')
-                runner_lines.append(' else')
-                runner_lines.append(' echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
-                runner_lines.append(' fi')
-                runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
-                runner_lines.append(' echo "[odysseus] To get a GPU build, first install vLLM via Cookbook -> Dependencies"')
-                runner_lines.append(' echo "[odysseus] (its CUDA wheels include nvcc), then re-launch this serve task."')
-                runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
-                runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
-                runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
-                runner_lines.append(' fi')
+                _append_llama_cpp_linux_accel_build_lines(runner_lines)
                 runner_lines.append(' fi')
                 runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
                 runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
diff --git a/tests/test_cookbook_helpers.py b/tests/test_cookbook_helpers.py
index 9d88b4024..d19c36eae 100644
--- a/tests/test_cookbook_helpers.py
+++ b/tests/test_cookbook_helpers.py
@@ -7,6 +7,7 @@ from fastapi import HTTPException
 
 from routes.cookbook_helpers import (
     _cached_model_scan_script,
+    _append_llama_cpp_linux_accel_build_lines,
     _append_serve_exit_code_lines,
     _append_serve_preflight_exit_lines,
     _local_tooling_path_export,
@@ -296,6 +297,27 @@ def test_ollama_serve_rejects_unsafe_bind_values():
     )
 
 
+def test_llama_cpp_linux_bootstrap_prefers_rocm_before_cuda():
+    runner_lines = []
+    _append_llama_cpp_linux_accel_build_lines(runner_lines)
+    script = "\n".join(runner_lines)
+
+    assert 'command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]' in script
+    assert 'cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON' in script
+    assert 'cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON' in script
+    assert script.index('DGGML_HIP=ON') < script.index('DGGML_CUDA=ON')
+    assert 'ROCm/HIP detected — building llama-server with HIP support' in script
+    assert '\\\\' not in script  # shell continuations must end in a single backslash
+
+
+def test_llama_cpp_linux_bootstrap_keeps_cpu_fallback_when_no_gpu_toolchain():
+    runner_lines = []
+    _append_llama_cpp_linux_accel_build_lines(runner_lines)
+    script = "\n".join(runner_lines)
+
+    assert 'WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only.' in script
+    assert 'Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA' in script
+
 def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
     """Custom download dirs may sit inside the HF hub cache and contain plain
     per-model folders. They must show up in Serve and keep the GGUF signal."""