mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Cookbook: prefer ROCm for native llama.cpp bootstrap
Co-authored-by: Kevin <120500656+oooindefatigable@users.noreply.github.com>
This commit is contained in:
@@ -487,8 +487,49 @@ def _append_serve_exit_code_lines(runner_lines: list[str], *, keep_shell_open: b
|
|||||||
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="; exec "${SHELL:-/bin/bash}"')
|
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="; exec "${SHELL:-/bin/bash}"')
|
||||||
else:
|
else:
|
||||||
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
|
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
|
||||||
|
runner_lines.append('exit "$ODYSSEUS_CMD_EXIT"')
|
||||||
|
|
||||||
|
|
||||||
|
def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
|
||||||
|
"""Append Linux llama.cpp build lines that prefer ROCm/HIP when available.
|
||||||
|
|
||||||
|
Cookbook already detects AMD GPUs elsewhere, but the llama.cpp bootstrap used
|
||||||
|
to hard-wire CUDA on Linux. That made ROCm hosts attempt a CUDA configure and
|
||||||
|
fail with "CUDA Toolkit not found" instead of building with HIP.
|
||||||
|
"""
|
||||||
|
# Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put it on PATH
|
||||||
|
# so cmake's CUDA configure can find it. We keep this after the ROCm/HIP
|
||||||
|
# check — a machine with both stacks should honor the native HIP toolchain on
|
||||||
|
# AMD hosts instead of accidentally preferring a stray nvcc wheel.
|
||||||
|
runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
|
||||||
|
runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
|
||||||
|
runner_lines.append(' done')
|
||||||
|
# rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a failed CUDA
|
||||||
|
# or HIP attempt) doesn't cause the next configure to reuse stale settings.
|
||||||
|
runner_lines.append(' cd ~/llama.cpp && rm -rf build')
|
||||||
|
runner_lines.append(' if command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]; then')
|
||||||
|
runner_lines.append(' if command -v hipconfig &>/dev/null; then')
|
||||||
|
runner_lines.append(' export HIPCXX="${HIPCXX:-$(hipconfig -l)/clang}"')
|
||||||
|
runner_lines.append(' export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"')
|
||||||
|
runner_lines.append(' fi')
|
||||||
|
runner_lines.append(' echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."')
|
||||||
|
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON \\\\')
|
||||||
|
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
|
||||||
|
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||||
|
runner_lines.append(' elif command -v nvcc &>/dev/null; then')
|
||||||
|
runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
|
||||||
|
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\\\')
|
||||||
|
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
|
||||||
|
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||||
|
runner_lines.append(' else')
|
||||||
|
runner_lines.append(' echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."')
|
||||||
|
runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
|
||||||
|
runner_lines.append(' echo "[odysseus] Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."')
|
||||||
|
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release \\\\')
|
||||||
|
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\\\')
|
||||||
|
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||||
|
runner_lines.append(' fi')
|
||||||
|
|
||||||
class ModelDownloadRequest(BaseModel):
|
class ModelDownloadRequest(BaseModel):
|
||||||
repo_id: str
|
repo_id: str
|
||||||
include: str | None = None # glob pattern e.g. "*Q4_K_M*"
|
include: str | None = None # glob pattern e.g. "*Q4_K_M*"
|
||||||
|
|||||||
@@ -37,8 +37,8 @@ from routes.cookbook_helpers import (
|
|||||||
_validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path,
|
_validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path,
|
||||||
_ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
|
_ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
|
||||||
_safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
|
_safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
|
||||||
_append_serve_exit_code_lines, _cached_model_scan_script, _ollama_bind_from_cmd,
|
_append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
|
||||||
_pip_install_fallback_chain, ModelDownloadRequest, ServeRequest,
|
_ollama_bind_from_cmd, _pip_install_fallback_chain, ModelDownloadRequest, ServeRequest,
|
||||||
)
|
)
|
||||||
|
|
||||||
_HF_TOKEN_STATUS_SNIPPET = (
|
_HF_TOKEN_STATUS_SNIPPET = (
|
||||||
@@ -963,43 +963,7 @@ def setup_cookbook_routes() -> APIRouter:
|
|||||||
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
||||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||||
runner_lines.append(' else')
|
runner_lines.append(' else')
|
||||||
# Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put
|
_append_llama_cpp_linux_accel_build_lines(runner_lines)
|
||||||
# it on PATH so cmake's CUDA configure can find it. We check the
|
|
||||||
# same three layouts as entrypoint.sh:
|
|
||||||
# nvidia/cu13 — nvidia-nvcc-cu13
|
|
||||||
# nvidia/cu12 — nvidia-nvcc-cu12
|
|
||||||
# nvidia/cuda_nvcc — nvidia-cuda-nvcc-cu12 (sub-package style)
|
|
||||||
runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
|
|
||||||
runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
|
|
||||||
runner_lines.append(' done')
|
|
||||||
# rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a
|
|
||||||
# failed CUDA attempt) doesn't cause the next configure to reuse
|
|
||||||
# stale settings and silently produce a CPU-only binary.
|
|
||||||
runner_lines.append(' cd ~/llama.cpp && rm -rf build')
|
|
||||||
runner_lines.append(' _ody_has_cuda_runtime=0')
|
|
||||||
runner_lines.append(' if command -v nvcc &>/dev/null; then')
|
|
||||||
runner_lines.append(' for _cudalib in "${CUDA_HOME:-}/lib64"/libcudart.so* "${CUDA_HOME:-}/lib"/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib*/libcudart.so*; do')
|
|
||||||
runner_lines.append(' [ -e "$_cudalib" ] && _ody_has_cuda_runtime=1 && break')
|
|
||||||
runner_lines.append(' done')
|
|
||||||
runner_lines.append(' fi')
|
|
||||||
runner_lines.append(' if command -v nvcc &>/dev/null && [ "$_ody_has_cuda_runtime" = "1" ]; then')
|
|
||||||
runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
|
|
||||||
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
|
|
||||||
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
|
||||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
|
||||||
runner_lines.append(' else')
|
|
||||||
runner_lines.append(' if command -v nvcc &>/dev/null; then')
|
|
||||||
runner_lines.append(' echo "[odysseus] WARNING: nvcc found but CUDA runtime library was not found — building llama-server for CPU only."')
|
|
||||||
runner_lines.append(' else')
|
|
||||||
runner_lines.append(' echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
|
|
||||||
runner_lines.append(' fi')
|
|
||||||
runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
|
|
||||||
runner_lines.append(' echo "[odysseus] To get a GPU build, first install vLLM via Cookbook -> Dependencies"')
|
|
||||||
runner_lines.append(' echo "[odysseus] (its CUDA wheels include nvcc), then re-launch this serve task."')
|
|
||||||
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
|
|
||||||
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
|
||||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
|
||||||
runner_lines.append(' fi')
|
|
||||||
runner_lines.append(' fi')
|
runner_lines.append(' fi')
|
||||||
runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
|
runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
|
||||||
runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
|
runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from fastapi import HTTPException
|
|||||||
|
|
||||||
from routes.cookbook_helpers import (
|
from routes.cookbook_helpers import (
|
||||||
_cached_model_scan_script,
|
_cached_model_scan_script,
|
||||||
|
_append_llama_cpp_linux_accel_build_lines,
|
||||||
_append_serve_exit_code_lines,
|
_append_serve_exit_code_lines,
|
||||||
_append_serve_preflight_exit_lines,
|
_append_serve_preflight_exit_lines,
|
||||||
_local_tooling_path_export,
|
_local_tooling_path_export,
|
||||||
@@ -296,6 +297,26 @@ def test_ollama_serve_rejects_unsafe_bind_values():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_llama_cpp_linux_bootstrap_prefers_rocm_before_cuda():
    """The generated script probes for ROCm/HIP and emits the HIP build before the CUDA one."""
    emitted = []
    _append_llama_cpp_linux_accel_build_lines(emitted)
    rendered = "\n".join(emitted)

    # Fragments that must all be present in the rendered shell script.
    required_fragments = (
        'command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]',
        'cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON',
        'cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON',
        'ROCm/HIP detected — building llama-server with HIP support',
    )
    for fragment in required_fragments:
        assert fragment in rendered

    # HIP must come first so the if/elif chain prefers it over CUDA.
    hip_at = rendered.index('DGGML_HIP=ON')
    cuda_at = rendered.index('DGGML_CUDA=ON')
    assert hip_at < cuda_at
|
||||||
|
|
||||||
|
|
||||||
|
def test_llama_cpp_linux_bootstrap_keeps_cpu_fallback_when_no_gpu_toolchain():
    """The generated script still carries the CPU-only branch and its guidance messages."""
    emitted = []
    _append_llama_cpp_linux_accel_build_lines(emitted)
    rendered = "\n".join(emitted)

    expected_messages = (
        'WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only.',
        'Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA',
    )
    for message in expected_messages:
        assert message in rendered
|
||||||
|
|
||||||
def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
|
def test_cached_model_scan_reports_plain_dir_gguf(tmp_path):
|
||||||
"""Custom download dirs may sit inside the HF hub cache and contain plain
|
"""Custom download dirs may sit inside the HF hub cache and contain plain
|
||||||
per-model folders. They must show up in Serve and keep the GGUF signal."""
|
per-model folders. They must show up in Serve and keep the GGUF signal."""
|
||||||
|
|||||||
Reference in New Issue
Block a user