Merge remote-tracking branch 'origin/main' into visual-pr-playground

# Conflicts:
#	routes/cookbook_routes.py
#	routes/hwfit_routes.py
#	services/hwfit/fit.py
#	services/hwfit/models.py
#	static/js/cookbook-diagnosis.js
#	static/js/cookbook-hwfit.js
#	static/js/cookbook.js
#	static/js/cookbookRunning.js
This commit is contained in:
pewdiepie-archdaemon
2026-06-03 16:49:10 +09:00
569 changed files with 35252 additions and 3489 deletions
+319 -73
View File
@@ -37,7 +37,8 @@ from routes.cookbook_helpers import (
_validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path,
_ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
_safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
_append_serve_exit_code_lines, _cached_model_scan_script,
_append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
_ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache, _venv_safe_local_pip_install_cmd,
ModelDownloadRequest, ServeRequest,
)
@@ -148,6 +149,15 @@ def setup_cookbook_routes() -> APIRouter:
"No GPUs are visible to the serve process.",
[{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
),
(
r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
"vLLM could not find a supported GPU (CUDA or ROCm). "
"This machine may have integrated or unsupported graphics only.",
[
{"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
{"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
],
),
(
r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
"vLLM is not installed or not in PATH on this server.",
@@ -163,6 +173,11 @@ def setup_cookbook_routes() -> APIRouter:
"llama.cpp / llama-cpp-python dependencies are missing.",
[{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
),
(
r"No GGUF found on this host|no \.gguf file|No GGUF file found",
"No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
[{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
),
(
r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
"Diffusion serving requires PyTorch and diffusers.",
@@ -368,11 +383,15 @@ def setup_cookbook_routes() -> APIRouter:
encoding="utf-8",
)
argv = [os.environ.get("ComSpec", "cmd.exe"), "/c", str(script_path)]
env = os.environ.copy()
env["PYTHONUTF8"] = "1"
env["PYTHONIOENCODING"] = "utf-8"
proc = subprocess.Popen(
argv,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
stdin=subprocess.DEVNULL,
env=env,
**detached_popen_kwargs(),
)
pid_path.write_text(str(proc.pid), encoding="utf-8")
@@ -432,12 +451,12 @@ def setup_cookbook_routes() -> APIRouter:
# throughput. Retries set disable_hf_transfer to fall back to the plain,
# slower-but-reliable downloader (resumes cleanly from the .incomplete files).
# Use `python3 -m pip` not `pip` — macOS has no bare `pip` command.
lines.append("command -v hf >/dev/null 2>&1 || python3 -m pip install --user --break-system-packages -q -U huggingface_hub 2>/dev/null || python3 -m pip install -q -U huggingface_hub 2>/dev/null")
lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
if req.disable_hf_transfer:
lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
else:
lines.append("python3 -c 'import hf_transfer' 2>/dev/null || python3 -m pip install --user --break-system-packages -q hf_transfer 2>/dev/null || python3 -m pip install -q hf_transfer 2>/dev/null")
lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
@@ -531,12 +550,18 @@ def setup_cookbook_routes() -> APIRouter:
)
# Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
# Install hf CLI + hf_transfer best-effort so future runs get the fast path.
# Install hf CLI + optional hf_transfer best-effort. Retries disable
# hf_transfer because the Rust parallel path is fast but has been
# flaky near the end of very large multi-file downloads.
# Use --break-system-packages on PEP-668 systems (Arch, newer Debian) so it doesn't bail.
runner_lines.append("command -v hf >/dev/null 2>&1 || pip install --user --break-system-packages -q -U huggingface_hub 2>/dev/null || pip install -q -U huggingface_hub 2>/dev/null")
runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null || pip install --user --break-system-packages -q hf_transfer 2>/dev/null || pip install -q hf_transfer 2>/dev/null")
runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
if req.disable_hf_transfer:
runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
else:
runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
# Surface whether the HF token actually reached THIS server, so a gated
# download's "not authorized" failure can be told apart from a missing
# token (the token is masked — we only print applied / not-set).
@@ -547,15 +572,19 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(f' {hf_cmd} < /dev/null')
runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then')
runner_lines.append(' echo "hf CLI not found, using Python huggingface_hub..."')
runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers=8)"')
runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
runner_lines.append('else')
runner_lines.append(' echo "Installing huggingface-hub and dependencies..."')
runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null')
runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers=8)"')
if req.disable_hf_transfer:
runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
runner_lines.append(' export HF_HUB_ENABLE_HF_TRANSFER=0')
else:
runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
runner_lines.append('fi')
runner_lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
runner_lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
runner_lines.append(f"rm -f {remote_runner}")
runner_lines.append('exec "${SHELL:-/bin/bash}"')
runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
@@ -586,11 +615,11 @@ def setup_cookbook_routes() -> APIRouter:
# Detached path: no controlling TTY, so skip `< /dev/null`
# (handled by Popen stdin=DEVNULL) and don't keep a shell open.
lines.append(hf_cmd)
lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
else:
# < /dev/null suppresses interactive "update available? [Y/n]" prompt
lines.append(f"{hf_cmd} < /dev/null")
lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
lines.append(f"rm -f '{wrapper_script}'")
lines.append('exec "${SHELL:-/bin/bash}"')
wrapper_script.write_text("\n".join(lines) + "\n", encoding="utf-8")
@@ -672,11 +701,14 @@ def setup_cookbook_routes() -> APIRouter:
cwd=str(Path.home()),
)
else:
# LOCAL scan: run the interpreter directly. `python3` isn't a thing on
# Windows (it's `python`/`py`), and shell single-quoting of the path
# doesn't survive cmd.exe — so resolve the interpreter and exec it
# with the script path as an argv element (no shell quoting needed).
local_py = (
# LOCAL scan: use sys.executable (the venv Python Odysseus is already
# running under) — it's guaranteed real Python on all platforms.
# Falling back to which_tool on Windows risks hitting the Microsoft
# Store stub alias for "python3"/"python", which prints
# "Python was not found; run without arguments to install from the
# Microsoft Store" and exits 9009, producing empty stdout and a
# JSON parse error. sys.executable bypasses PATH entirely.
local_py = sys.executable or (
which_tool("python3") or which_tool("python")
or which_tool("py") or "python"
)
@@ -714,6 +746,8 @@ def setup_cookbook_routes() -> APIRouter:
entry["backend"] = m.get("backend")
if m.get("is_ollama"):
entry["is_ollama"] = True
if isinstance(m.get("gguf_files"), list):
entry["gguf_files"] = m["gguf_files"]
models.append(entry)
except Exception as e:
logger.warning(f"Failed to parse cached models: {e}")
@@ -775,6 +809,80 @@ def setup_cookbook_routes() -> APIRouter:
finally:
db.close()
def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None:
"""Register a freshly-served LLM as a model endpoint so it appears in the
model picker without a manual /setup step — the text-model sibling of
_auto_register_image_endpoint.
Cookbook serve commands launch an OpenAI-compatible server (llama.cpp's
llama-server, vLLM, SGLang, or Ollama) on a known port. We point an
endpoint at that server's /v1; the picker auto-discovers the model id by
probing /v1/models and dims the endpoint until the server is reachable,
so registering immediately (before the server finishes loading) is safe.
"""
import re
from core.database import SessionLocal, ModelEndpoint
# Port: an explicit --port wins. Otherwise fall back by backend — Ollama
# is the only server in our generated commands that omits --port.
port_match = re.search(r'--port\s+(\d+)', req.cmd)
if port_match:
port = int(port_match.group(1))
elif "ollama" in req.cmd:
port = 11434
else:
port = 8080 # llama.cpp's llama-server default — the Apple Silicon path
# Determine host (mirrors the image path: SSH alias for remote serves).
if remote:
host = remote.split("@")[-1] if "@" in remote else remote
else:
host = "localhost"
base_url = f"http://{host}:{port}/v1"
short_name = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
display_name = short_name or "Local model"
# If the serve command opts models into OpenAI tool-calling, record it so
# agent_loop trusts emitted tool_calls instead of the name heuristic.
supports_tools = True if "--enable-auto-tool-choice" in req.cmd else None
db = SessionLocal()
try:
# Reuse an endpoint already pointed at this URL instead of duplicating.
existing = db.query(ModelEndpoint).filter(ModelEndpoint.base_url == base_url).first()
if existing:
existing.is_enabled = True
existing.model_type = "llm"
existing.name = display_name
if supports_tools is not None:
existing.supports_tools = supports_tools
db.commit()
logger.info(f"Updated existing local model endpoint: {base_url}")
return existing.id
ep_id = f"local-{uuid.uuid4().hex[:8]}"
ep = ModelEndpoint(
id=ep_id,
name=display_name,
base_url=base_url,
api_key=None,
is_enabled=True,
model_type="llm",
supports_tools=supports_tools,
)
db.add(ep)
db.commit()
logger.info(f"Auto-registered local model endpoint: {display_name} @ {base_url}")
return ep_id
except Exception as e:
logger.error(f"Failed to auto-register local model endpoint: {e}")
db.rollback()
return None
finally:
db.close()
@router.post("/api/model/serve")
async def model_serve(request: Request, req: ServeRequest):
"""Launch a model server in a tmux session (or PowerShell background process on Windows).
@@ -800,8 +908,17 @@ def setup_cookbook_routes() -> APIRouter:
# many downstream `"engine" in req.cmd` membership checks can't hit
# `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
req.cmd = _validate_serve_cmd(req.cmd) or ""
req.cmd = _venv_safe_local_pip_install_cmd(
req.cmd,
local=not bool(req.remote_host),
in_venv=sys.prefix != sys.base_prefix,
)
is_pip_install = bool(req.cmd and "pip install" in req.cmd)
if is_pip_install:
# Keep big dependency wheel builds (vLLM, …) off the home filesystem's
# pip cache so they don't fail mid-build with "No space left" (#1219)
# and leave the dep installed-but-unusable (#1459).
req.cmd = _pip_install_no_cache(req.cmd)
# PEP-508-style package spec — letters, digits, `.-_` for the
# name; `[` `]` for extras; `<>=!~,` for version specifiers.
# v2 review HIGH-14: tightened from the previous regex which
@@ -922,7 +1039,7 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then')
runner_lines.append(' pkg install -y cmake 2>/dev/null')
runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null')
runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install llama-cpp-python --no-build-isolation --no-cache-dir 2>&1 || true')
runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
runner_lines.append(' fi')
runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
@@ -944,61 +1061,45 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' else')
# Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put
# it on PATH so cmake's CUDA configure can find it. We check the
# same three layouts as entrypoint.sh:
# nvidia/cu13 — nvidia-nvcc-cu13
# nvidia/cu12 — nvidia-nvcc-cu12
# nvidia/cuda_nvcc — nvidia-cuda-nvcc-cu12 (sub-package style)
runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
runner_lines.append(' done')
# rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a
# failed CUDA attempt) doesn't cause the next configure to reuse
# stale settings and silently produce a CPU-only binary.
runner_lines.append(' cd ~/llama.cpp && rm -rf build')
runner_lines.append(' if command -v nvcc &>/dev/null; then')
runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' else')
runner_lines.append(' echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
runner_lines.append(' echo "[odysseus] To get a GPU build, first install vLLM via Cookbook -> Dependencies"')
runner_lines.append(' echo "[odysseus] (its CUDA wheels include nvcc), then re-launch this serve task."')
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
runner_lines.append(' fi')
_append_llama_cpp_linux_accel_build_lines(runner_lines)
runner_lines.append(' fi')
runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
runner_lines.append(' pip install --user --break-system-packages -q llama-cpp-python 2>/dev/null || pip install -q llama-cpp-python 2>/dev/null || true')
runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
runner_lines.append(' fi')
runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append(' fi')
runner_lines.append('fi')
elif "ollama" in req.cmd:
handled_ollama_serve = True
_ollama_port = "11434"
_ollama_match = re.search(r"OLLAMA_HOST=[^\s:]+:(\d+)", req.cmd)
if _ollama_match:
_ollama_port = _ollama_match.group(1)
_ollama_default_host = "0.0.0.0" if remote else "127.0.0.1"
_ollama_host, _ollama_port = _ollama_bind_from_cmd(
req.cmd,
default_host=_ollama_default_host,
)
# Ollama can be a host binary, a system service, or a Docker
# container. If the HTTP API is already reachable, the model is
# already served and we should not require a host `ollama` CLI.
runner_lines.append(f'ODYSSEUS_OLLAMA_HOST={_bash_squote(_ollama_host)}')
runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"')
runner_lines.append('ODYSSEUS_OLLAMA_URL=""')
runner_lines.append('for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue')
runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
runner_lines.append(' break 2')
runner_lines.append(' fi')
runner_lines.append('for _ody_ollama_try in $(seq 1 20); do')
runner_lines.append(' for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue')
runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
runner_lines.append(' break 3')
runner_lines.append(' fi')
runner_lines.append(' done')
runner_lines.append(' done')
runner_lines.append(' [ "$_ody_ollama_try" -eq 1 ] && echo "[odysseus] Waiting for an existing Ollama API on ports ${ODYSSEUS_OLLAMA_PORT}/11434..."')
runner_lines.append(' sleep 1')
runner_lines.append('done')
runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then')
runner_lines.append(' if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then')
@@ -1015,8 +1116,12 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(' echo "=== Process exited with code 127 ==="')
runner_lines.append(' exec bash -i')
runner_lines.append('fi')
runner_lines.append('echo "Starting ollama server on 0.0.0.0:${ODYSSEUS_OLLAMA_PORT}..."')
runner_lines.append('OLLAMA_HOST="0.0.0.0:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
runner_lines.append('ODYSSEUS_OLLAMA_URL="http://${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}"')
if remote and _ollama_host in ("0.0.0.0", "::"):
runner_lines.append('echo "[odysseus] WARNING: remote Ollama will bind to ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT} so Odysseus can reach it from this host."')
runner_lines.append('echo "[odysseus] Ollama has no built-in authentication; expose this only on a trusted LAN/VPN or provide an explicit OLLAMA_HOST with your own access controls."')
runner_lines.append('echo "Starting ollama server on ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}..."')
runner_lines.append('OLLAMA_HOST="${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
runner_lines.append('_ody_exit=$?')
runner_lines.append('echo')
runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
@@ -1032,19 +1137,24 @@ def setup_cookbook_routes() -> APIRouter:
# find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
runner_lines.append('if ! command -v vllm &>/dev/null; then')
runner_lines.append(' echo "ERROR: vLLM is not installed. Open Cookbook -> Dependencies and install vllm on this server, then launch again."')
runner_lines.append(' echo "ERROR: vLLM is not installed."')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('fi')
elif "sglang.launch_server" in req.cmd:
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
runner_lines.append('if ! python3 -c "import sglang" 2>/dev/null; then')
runner_lines.append(' echo "ERROR: SGLang is not installed. Open Cookbook -> Dependencies and install sglang on this server, then launch again."')
runner_lines.append('if ! command -v sglang &>/dev/null; then')
runner_lines.append(' echo "ERROR: SGLang is not installed."')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('elif ! ODYSSEUS_SGLANG_IMPORT_ERROR="$(python3 -c "import sglang" 2>&1)"; then')
runner_lines.append(' echo "ERROR: SGLang is installed but failed to import."')
runner_lines.append(' printf "%s\\n" "$ODYSSEUS_SGLANG_IMPORT_ERROR"')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('fi')
elif "scripts/diffusion_server.py" in req.cmd or ".diffusion_server.py" in req.cmd:
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
runner_lines.append('if ! python3 -c "import torch, diffusers" 2>/dev/null; then')
runner_lines.append(' echo "ERROR: Diffusion serving requires PyTorch + diffusers. Open Cookbook -> Dependencies and install diffusers on this server, then launch again."')
runner_lines.append('if ! ODYSSEUS_DIFFUSION_IMPORT_ERROR="$(python3 -c "import torch, diffusers" 2>&1)"; then')
runner_lines.append(' echo "ERROR: Diffusion serving requires PyTorch + diffusers."')
runner_lines.append(' printf "%s\\n" "$ODYSSEUS_DIFFUSION_IMPORT_ERROR"')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('fi')
@@ -1116,11 +1226,16 @@ def setup_cookbook_routes() -> APIRouter:
stderr = (await proc.stderr.read()).decode(errors="replace")
return {"ok": False, "error": stderr, "session_id": session_id}
# Auto-register as model endpoint if serving a diffusion model
# Auto-register a model endpoint so the served model shows up in the model
# picker with no manual /setup step. Diffusion models get an image
# endpoint; any other real model serve (i.e. not a pip-install task) gets
# a local LLM endpoint pointed at its /v1.
endpoint_id = None
is_diffusion = "diffusion_server.py" in req.cmd
if is_diffusion:
endpoint_id = _auto_register_image_endpoint(req, remote)
elif not is_pip_install:
endpoint_id = _auto_register_llm_endpoint(req, remote)
# Log to assistant
try:
@@ -1357,9 +1472,16 @@ def setup_cookbook_routes() -> APIRouter:
total_mb = max(0, int(total_bytes / (1024 * 1024)))
used_mb = max(0, min(total_mb, int(used_bytes / (1024 * 1024))))
free_mb = max(0, total_mb - used_mb)
# GTT = the system-RAM pool the GPU pages into when VRAM is full.
# On a discrete card a large gtt_used means the model spilled past
# VRAM into RAM over PCIe — much slower. Surface it so the UI can
# warn "spilling to RAM" instead of the user wondering why it's slow.
gtt_used_raw = await _gpu_read_file(f"{base}/mem_info_gtt_used", host, ssh_port)
gtt_used_mb = max(0, int(int(gtt_used_raw) / (1024 * 1024))) if (gtt_used_raw and gtt_used_raw.isdigit()) else 0
gpus.append({
"index": len(gpus), "name": name, "uuid": entry,
"free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb,
"gtt_used_mb": gtt_used_mb,
"util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85),
"processes": [], "backend": "rocm", "source": "amd-sysfs",
"unified_memory": unified,
@@ -1461,6 +1583,46 @@ def setup_cookbook_routes() -> APIRouter:
if gpus:
return {"ok": True, "gpus": gpus, "backend": "cuda", "source": "nvidia-smi"}
# Local Apple Silicon / Metal fallback. macOS has no nvidia-smi and no
# Linux /sys/class/drm tree, but services.hwfit.hardware already knows
# how to size the shared unified-memory GPU budget. Keep this route in
# sync so Cookbook's GPU picker doesn't show "nvidia-smi not found" on
# native Mac launches.
if not host and sys.platform == "darwin":
try:
from services.hwfit.hardware import detect_system
info = detect_system(fresh=True)
backend = str(info.get("backend") or "").lower()
if backend in {"metal", "mps", "apple"} and info.get("gpu_count", 0) > 0:
total_mb = int(float(info.get("gpu_vram_gb") or info.get("total_ram_gb") or 0) * 1024)
free_mb = int(float(info.get("available_ram_gb") or 0) * 1024)
if total_mb and (free_mb <= 0 or free_mb > total_mb):
free_mb = total_mb
used_mb = max(0, total_mb - max(0, free_mb))
return {
"ok": True,
"gpus": [{
"index": 0,
"name": info.get("gpu_name") or info.get("cpu_name") or "Apple Silicon GPU",
"uuid": "apple-metal-0",
"free_mb": max(0, free_mb),
"total_mb": max(0, total_mb),
"used_mb": used_mb,
"util_pct": 0,
"busy": bool(total_mb and (free_mb / total_mb) < 0.5),
"processes": [],
"backend": "metal",
"source": "apple-metal",
"unified_memory": True,
}],
"backend": "metal",
"source": "apple-metal",
"fallback_from": "nvidia-smi",
"nvidia_error": nvidia_error,
}
except Exception as e:
logger.warning("Apple Metal GPU fallback failed: %s", e)
amd_gpus = await _probe_amd_sysfs(host, ssh_port)
if amd_gpus:
return {
@@ -1607,6 +1769,33 @@ def setup_cookbook_routes() -> APIRouter:
disk_tasks = on_disk.get("tasks") or [] if isinstance(on_disk, dict) else []
incoming_tasks = data.get("tasks") if isinstance(data.get("tasks"), list) else []
# Anti-poisoning guard: a stale browser tab can keep POSTing a
# download task as status='done' from before the strict-finish
# fix landed, undoing any server-side correction. For each
# incoming "done" download, override to "running" if the last
# shard pattern says N<total AND no DOWNLOAD_OK/DOWNLOAD_FAILED/
# /snapshots/ sentinel is in the output.
import re as _re_dl
for _it in incoming_tasks:
if (not isinstance(_it, dict)) or _it.get("type") != "download" or _it.get("status") != "done":
continue
_out = _it.get("output") or ""
if ("DOWNLOAD_OK" in _out) or ("DOWNLOAD_FAILED" in _out) or ("/snapshots/" in _out):
continue
_shards = _re_dl.findall(r"model-(\d+)-of-(\d+)\.safetensors", _out)
if _shards:
_n, _tot = _shards[-1]
if int(_n) < int(_tot):
logger.info(f"cookbook state POST: rejecting stale done for {_it.get('sessionId')} "
f"(last shard {_n}/{_tot}, no DOWNLOAD_OK)")
_it["status"] = "running"
else:
_completed = _out.count("Download complete")
_starts = _out.count("Downloading '")
if _starts > _completed:
logger.info(f"cookbook state POST: rejecting stale done for {_it.get('sessionId')} "
f"({_completed}/{_starts} files complete, no DOWNLOAD_OK)")
_it["status"] = "running"
incoming_ids = {t.get("sessionId") for t in incoming_tasks if isinstance(t, dict) and t.get("sessionId")}
import time as _t
now_ms = int(_t.time() * 1000)
@@ -1763,6 +1952,43 @@ def setup_cookbook_routes() -> APIRouter:
def _cookbook_tasks_status_sync():
import subprocess
def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
"""Best-effort check for a completed HF cache entry.
tmux output can stop at a stale progress line if the pane/session
disappears before Cookbook captures the final DOWNLOAD_OK marker.
In that case, trust the cache shape: a snapshot directory with files
and no *.incomplete blobs means HuggingFace finished materializing the
model.
"""
if not repo_id or "/" not in repo_id:
return False
py = (
"import os,sys;"
"repo=sys.argv[1];"
"base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
"d=os.path.join(base,'models--'+repo.replace('/','--'));"
"snap=os.path.join(d,'snapshots');"
"ok=os.path.isdir(snap) and any(os.path.isdir(os.path.join(snap,x)) and os.listdir(os.path.join(snap,x)) for x in os.listdir(snap));"
"inc=False;"
"blobs=os.path.join(d,'blobs');"
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
"sys.exit(0 if ok and not inc else 1)"
)
cmd = ["python3", "-c", py, repo_id]
try:
if remote_host:
ssh_base = ["ssh"]
if ssh_port and ssh_port != "22":
ssh_base.extend(["-p", str(ssh_port)])
shell_cmd = " ".join(shlex.quote(x) for x in cmd)
proc = subprocess.run(ssh_base + [remote_host, shell_cmd], timeout=12, capture_output=True)
else:
proc = subprocess.run(cmd, timeout=12, capture_output=True)
return proc.returncode == 0
except Exception:
return False
# Load saved tasks from cookbook state
tasks = []
if _cookbook_state_path.exists():
@@ -1902,14 +2128,21 @@ def setup_cookbook_routes() -> APIRouter:
# persists after the process exits, so a finished download still has a
# snapshot to classify (DOWNLOAD_OK / exit marker) — evaluate it even
# when the PID is gone instead of blindly reporting "stopped".
download_zero_files = False
status = "unknown"
if is_alive or (local_win_task and full_snapshot):
lower = full_snapshot.lower()
has_exit = "=== process exited with code" in lower
exit_match = re.search(r"=== process exited with code\s+(-?\d+)", full_snapshot, re.I)
has_exit = exit_match is not None
exit_code = int(exit_match.group(1)) if exit_match else None
has_error = "error" in lower or "failed" in lower or "traceback" in lower
if has_exit and task_type == "serve":
# Serve tasks that exit are always errors — they should run indefinitely
status = "error"
elif has_exit and task_type == "download":
# Dependency installs are tracked as download tasks but only
# emit the generic runner exit marker, not HF download markers.
status = "completed" if exit_code == 0 else "error"
elif has_exit and "unrecognized arguments" in lower:
status = "error"
elif has_error and not ("application startup complete" in lower):
@@ -1918,7 +2151,11 @@ def setup_cookbook_routes() -> APIRouter:
# Only download tasks treat 100% as "completed".
# Serve tasks log 100%|██████| during inference progress
# (diffusion sampling, etc.) — that's "running", not done.
status = "completed"
if re.search(r"Fetching\s+0\s+files", full_snapshot, re.IGNORECASE):
status = "error"
download_zero_files = True
else:
status = "completed"
elif "application startup complete" in lower:
status = "ready"
elif not is_alive:
@@ -1928,7 +2165,14 @@ def setup_cookbook_routes() -> APIRouter:
status = "running"
else:
# Session is dead — check if it completed or crashed
status = "stopped"
if task_type == "download" and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or "")):
status = "completed"
if not progress_text:
progress_text = "Download complete"
if not full_snapshot:
full_snapshot = "DOWNLOAD_OK"
else:
status = "stopped"
# Parse structured phase info — single source of truth for the UI
phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {}
@@ -1938,6 +2182,8 @@ def setup_cookbook_routes() -> APIRouter:
diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None
if diagnosis and status in {"running", "unknown", "stopped"}:
status = "error"
if download_zero_files:
diagnosis = {"message": "No matching files were downloaded. The model repo or filename/quant pattern may be wrong (for example a ':Q4_K_M' tag that does not exist in the repo). Check the repo and the include/quant pattern."}
output_tail = "\n".join(full_snapshot.splitlines()[-12:]) if full_snapshot else ""
results.append({