mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-30 00:22:10 -04:00
Merge remote-tracking branch 'origin/main' into visual-pr-playground
# Conflicts: # routes/cookbook_routes.py # routes/hwfit_routes.py # services/hwfit/fit.py # services/hwfit/models.py # static/js/cookbook-diagnosis.js # static/js/cookbook-hwfit.js # static/js/cookbook.js # static/js/cookbookRunning.js
This commit is contained in:
+319
-73
@@ -37,7 +37,8 @@ from routes.cookbook_helpers import (
|
||||
_validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path,
|
||||
_ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
|
||||
_safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
|
||||
_append_serve_exit_code_lines, _cached_model_scan_script,
|
||||
_append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
|
||||
_ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache, _venv_safe_local_pip_install_cmd,
|
||||
ModelDownloadRequest, ServeRequest,
|
||||
)
|
||||
|
||||
@@ -148,6 +149,15 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
"No GPUs are visible to the serve process.",
|
||||
[{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
|
||||
),
|
||||
(
|
||||
r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
|
||||
"vLLM could not find a supported GPU (CUDA or ROCm). "
|
||||
"This machine may have integrated or unsupported graphics only.",
|
||||
[
|
||||
{"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
|
||||
{"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
|
||||
"vLLM is not installed or not in PATH on this server.",
|
||||
@@ -163,6 +173,11 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
"llama.cpp / llama-cpp-python dependencies are missing.",
|
||||
[{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
|
||||
),
|
||||
(
|
||||
r"No GGUF found on this host|no \.gguf file|No GGUF file found",
|
||||
"No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
|
||||
[{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
|
||||
),
|
||||
(
|
||||
r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
|
||||
"Diffusion serving requires PyTorch and diffusers.",
|
||||
@@ -368,11 +383,15 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
encoding="utf-8",
|
||||
)
|
||||
argv = [os.environ.get("ComSpec", "cmd.exe"), "/c", str(script_path)]
|
||||
env = os.environ.copy()
|
||||
env["PYTHONUTF8"] = "1"
|
||||
env["PYTHONIOENCODING"] = "utf-8"
|
||||
proc = subprocess.Popen(
|
||||
argv,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
stdin=subprocess.DEVNULL,
|
||||
env=env,
|
||||
**detached_popen_kwargs(),
|
||||
)
|
||||
pid_path.write_text(str(proc.pid), encoding="utf-8")
|
||||
@@ -432,12 +451,12 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# throughput. Retries set disable_hf_transfer to fall back to the plain,
|
||||
# slower-but-reliable downloader (resumes cleanly from the .incomplete files).
|
||||
# Use `python3 -m pip` not `pip` — macOS has no bare `pip` command.
|
||||
lines.append("command -v hf >/dev/null 2>&1 || python3 -m pip install --user --break-system-packages -q -U huggingface_hub 2>/dev/null || python3 -m pip install -q -U huggingface_hub 2>/dev/null")
|
||||
lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
|
||||
if req.disable_hf_transfer:
|
||||
lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
|
||||
lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
|
||||
else:
|
||||
lines.append("python3 -c 'import hf_transfer' 2>/dev/null || python3 -m pip install --user --break-system-packages -q hf_transfer 2>/dev/null || python3 -m pip install -q hf_transfer 2>/dev/null")
|
||||
lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
|
||||
lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
|
||||
lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
|
||||
|
||||
@@ -531,12 +550,18 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
)
|
||||
# Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
|
||||
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
|
||||
# Install hf CLI + hf_transfer best-effort so future runs get the fast path.
|
||||
# Install hf CLI + optional hf_transfer best-effort. Retries disable
|
||||
# hf_transfer because the Rust parallel path is fast but has been
|
||||
# flaky near the end of very large multi-file downloads.
|
||||
# Use --break-system-packages on PEP-668 systems (Arch, newer Debian) so it doesn't bail.
|
||||
runner_lines.append("command -v hf >/dev/null 2>&1 || pip install --user --break-system-packages -q -U huggingface_hub 2>/dev/null || pip install -q -U huggingface_hub 2>/dev/null")
|
||||
runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null || pip install --user --break-system-packages -q hf_transfer 2>/dev/null || pip install -q hf_transfer 2>/dev/null")
|
||||
runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
|
||||
runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
|
||||
runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
|
||||
if req.disable_hf_transfer:
|
||||
runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
|
||||
runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
|
||||
else:
|
||||
runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
|
||||
runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
|
||||
runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
|
||||
# Surface whether the HF token actually reached THIS server, so a gated
|
||||
# download's "not authorized" failure can be told apart from a missing
|
||||
# token (the token is masked — we only print applied / not-set).
|
||||
@@ -547,15 +572,19 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
runner_lines.append(f' {hf_cmd} < /dev/null')
|
||||
runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then')
|
||||
runner_lines.append(' echo "hf CLI not found, using Python huggingface_hub..."')
|
||||
runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers=8)"')
|
||||
runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
|
||||
runner_lines.append('else')
|
||||
runner_lines.append(' echo "Installing huggingface-hub and dependencies..."')
|
||||
runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null')
|
||||
runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
|
||||
runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
|
||||
runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers=8)"')
|
||||
if req.disable_hf_transfer:
|
||||
runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
|
||||
runner_lines.append(' export HF_HUB_ENABLE_HF_TRANSFER=0')
|
||||
else:
|
||||
runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
|
||||
runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
|
||||
runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
|
||||
runner_lines.append('fi')
|
||||
runner_lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
|
||||
runner_lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
|
||||
runner_lines.append(f"rm -f {remote_runner}")
|
||||
runner_lines.append('exec "${SHELL:-/bin/bash}"')
|
||||
runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
|
||||
@@ -586,11 +615,11 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# Detached path: no controlling TTY, so skip `< /dev/null`
|
||||
# (handled by Popen stdin=DEVNULL) and don't keep a shell open.
|
||||
lines.append(hf_cmd)
|
||||
lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
|
||||
lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
|
||||
else:
|
||||
# < /dev/null suppresses interactive "update available? [Y/n]" prompt
|
||||
lines.append(f"{hf_cmd} < /dev/null")
|
||||
lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
|
||||
lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
|
||||
lines.append(f"rm -f '{wrapper_script}'")
|
||||
lines.append('exec "${SHELL:-/bin/bash}"')
|
||||
wrapper_script.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
||||
@@ -672,11 +701,14 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
cwd=str(Path.home()),
|
||||
)
|
||||
else:
|
||||
# LOCAL scan: run the interpreter directly. `python3` isn't a thing on
|
||||
# Windows (it's `python`/`py`), and shell single-quoting of the path
|
||||
# doesn't survive cmd.exe — so resolve the interpreter and exec it
|
||||
# with the script path as an argv element (no shell quoting needed).
|
||||
local_py = (
|
||||
# LOCAL scan: use sys.executable (the venv Python Odysseus is already
|
||||
# running under) — it's guaranteed real Python on all platforms.
|
||||
# Falling back to which_tool on Windows risks hitting the Microsoft
|
||||
# Store stub alias for "python3"/"python", which prints
|
||||
# "Python was not found; run without arguments to install from the
|
||||
# Microsoft Store" and exits 9009, producing empty stdout and a
|
||||
# JSON parse error. sys.executable bypasses PATH entirely.
|
||||
local_py = sys.executable or (
|
||||
which_tool("python3") or which_tool("python")
|
||||
or which_tool("py") or "python"
|
||||
)
|
||||
@@ -714,6 +746,8 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
entry["backend"] = m.get("backend")
|
||||
if m.get("is_ollama"):
|
||||
entry["is_ollama"] = True
|
||||
if isinstance(m.get("gguf_files"), list):
|
||||
entry["gguf_files"] = m["gguf_files"]
|
||||
models.append(entry)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse cached models: {e}")
|
||||
@@ -775,6 +809,80 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None:
|
||||
"""Register a freshly-served LLM as a model endpoint so it appears in the
|
||||
model picker without a manual /setup step — the text-model sibling of
|
||||
_auto_register_image_endpoint.
|
||||
|
||||
Cookbook serve commands launch an OpenAI-compatible server (llama.cpp's
|
||||
llama-server, vLLM, SGLang, or Ollama) on a known port. We point an
|
||||
endpoint at that server's /v1; the picker auto-discovers the model id by
|
||||
probing /v1/models and dims the endpoint until the server is reachable,
|
||||
so registering immediately (before the server finishes loading) is safe.
|
||||
"""
|
||||
import re
|
||||
from core.database import SessionLocal, ModelEndpoint
|
||||
|
||||
# Port: an explicit --port wins. Otherwise fall back by backend — Ollama
|
||||
# is the only server in our generated commands that omits --port.
|
||||
port_match = re.search(r'--port\s+(\d+)', req.cmd)
|
||||
if port_match:
|
||||
port = int(port_match.group(1))
|
||||
elif "ollama" in req.cmd:
|
||||
port = 11434
|
||||
else:
|
||||
port = 8080 # llama.cpp's llama-server default — the Apple Silicon path
|
||||
|
||||
# Determine host (mirrors the image path: SSH alias for remote serves).
|
||||
if remote:
|
||||
host = remote.split("@")[-1] if "@" in remote else remote
|
||||
else:
|
||||
host = "localhost"
|
||||
|
||||
base_url = f"http://{host}:{port}/v1"
|
||||
|
||||
short_name = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
|
||||
display_name = short_name or "Local model"
|
||||
|
||||
# If the serve command opts models into OpenAI tool-calling, record it so
|
||||
# agent_loop trusts emitted tool_calls instead of the name heuristic.
|
||||
supports_tools = True if "--enable-auto-tool-choice" in req.cmd else None
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
# Reuse an endpoint already pointed at this URL instead of duplicating.
|
||||
existing = db.query(ModelEndpoint).filter(ModelEndpoint.base_url == base_url).first()
|
||||
if existing:
|
||||
existing.is_enabled = True
|
||||
existing.model_type = "llm"
|
||||
existing.name = display_name
|
||||
if supports_tools is not None:
|
||||
existing.supports_tools = supports_tools
|
||||
db.commit()
|
||||
logger.info(f"Updated existing local model endpoint: {base_url}")
|
||||
return existing.id
|
||||
|
||||
ep_id = f"local-{uuid.uuid4().hex[:8]}"
|
||||
ep = ModelEndpoint(
|
||||
id=ep_id,
|
||||
name=display_name,
|
||||
base_url=base_url,
|
||||
api_key=None,
|
||||
is_enabled=True,
|
||||
model_type="llm",
|
||||
supports_tools=supports_tools,
|
||||
)
|
||||
db.add(ep)
|
||||
db.commit()
|
||||
logger.info(f"Auto-registered local model endpoint: {display_name} @ {base_url}")
|
||||
return ep_id
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to auto-register local model endpoint: {e}")
|
||||
db.rollback()
|
||||
return None
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@router.post("/api/model/serve")
|
||||
async def model_serve(request: Request, req: ServeRequest):
|
||||
"""Launch a model server in a tmux session (or PowerShell background process on Windows).
|
||||
@@ -800,8 +908,17 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# many downstream `"engine" in req.cmd` membership checks can't hit
|
||||
# `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
|
||||
req.cmd = _validate_serve_cmd(req.cmd) or ""
|
||||
req.cmd = _venv_safe_local_pip_install_cmd(
|
||||
req.cmd,
|
||||
local=not bool(req.remote_host),
|
||||
in_venv=sys.prefix != sys.base_prefix,
|
||||
)
|
||||
is_pip_install = bool(req.cmd and "pip install" in req.cmd)
|
||||
if is_pip_install:
|
||||
# Keep big dependency wheel builds (vLLM, …) off the home filesystem's
|
||||
# pip cache so they don't fail mid-build with "No space left" (#1219)
|
||||
# and leave the dep installed-but-unusable (#1459).
|
||||
req.cmd = _pip_install_no_cache(req.cmd)
|
||||
# PEP-508-style package spec — letters, digits, `.-_` for the
|
||||
# name; `[` `]` for extras; `<>=!~,` for version specifiers.
|
||||
# v2 review HIGH-14: tightened from the previous regex which
|
||||
@@ -922,7 +1039,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then')
|
||||
runner_lines.append(' pkg install -y cmake 2>/dev/null')
|
||||
runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null')
|
||||
runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install llama-cpp-python --no-build-isolation --no-cache-dir 2>&1 || true')
|
||||
runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
|
||||
runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
|
||||
@@ -944,61 +1061,45 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||
runner_lines.append(' else')
|
||||
# Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put
|
||||
# it on PATH so cmake's CUDA configure can find it. We check the
|
||||
# same three layouts as entrypoint.sh:
|
||||
# nvidia/cu13 — nvidia-nvcc-cu13
|
||||
# nvidia/cu12 — nvidia-nvcc-cu12
|
||||
# nvidia/cuda_nvcc — nvidia-cuda-nvcc-cu12 (sub-package style)
|
||||
runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
|
||||
runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
|
||||
runner_lines.append(' done')
|
||||
# rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a
|
||||
# failed CUDA attempt) doesn't cause the next configure to reuse
|
||||
# stale settings and silently produce a CPU-only binary.
|
||||
runner_lines.append(' cd ~/llama.cpp && rm -rf build')
|
||||
runner_lines.append(' if command -v nvcc &>/dev/null; then')
|
||||
runner_lines.append(' echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
|
||||
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
|
||||
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||
runner_lines.append(' else')
|
||||
runner_lines.append(' echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
|
||||
runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."')
|
||||
runner_lines.append(' echo "[odysseus] To get a GPU build, first install vLLM via Cookbook -> Dependencies"')
|
||||
runner_lines.append(' echo "[odysseus] (its CUDA wheels include nvcc), then re-launch this serve task."')
|
||||
runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
|
||||
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||
runner_lines.append(' fi')
|
||||
_append_llama_cpp_linux_accel_build_lines(runner_lines)
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
|
||||
runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
|
||||
runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
|
||||
runner_lines.append(' pip install --user --break-system-packages -q llama-cpp-python 2>/dev/null || pip install -q llama-cpp-python 2>/dev/null || true')
|
||||
runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
|
||||
runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."')
|
||||
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append('fi')
|
||||
elif "ollama" in req.cmd:
|
||||
handled_ollama_serve = True
|
||||
_ollama_port = "11434"
|
||||
_ollama_match = re.search(r"OLLAMA_HOST=[^\s:]+:(\d+)", req.cmd)
|
||||
if _ollama_match:
|
||||
_ollama_port = _ollama_match.group(1)
|
||||
_ollama_default_host = "0.0.0.0" if remote else "127.0.0.1"
|
||||
_ollama_host, _ollama_port = _ollama_bind_from_cmd(
|
||||
req.cmd,
|
||||
default_host=_ollama_default_host,
|
||||
)
|
||||
# Ollama can be a host binary, a system service, or a Docker
|
||||
# container. If the HTTP API is already reachable, the model is
|
||||
# already served and we should not require a host `ollama` CLI.
|
||||
runner_lines.append(f'ODYSSEUS_OLLAMA_HOST={_bash_squote(_ollama_host)}')
|
||||
runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"')
|
||||
runner_lines.append('ODYSSEUS_OLLAMA_URL=""')
|
||||
runner_lines.append('for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
|
||||
runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue')
|
||||
runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
|
||||
runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
|
||||
runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
|
||||
runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
|
||||
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
|
||||
runner_lines.append(' break 2')
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append('for _ody_ollama_try in $(seq 1 20); do')
|
||||
runner_lines.append(' for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
|
||||
runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue')
|
||||
runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
|
||||
runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
|
||||
runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
|
||||
runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
|
||||
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
|
||||
runner_lines.append(' break 3')
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append(' done')
|
||||
runner_lines.append(' done')
|
||||
runner_lines.append(' [ "$_ody_ollama_try" -eq 1 ] && echo "[odysseus] Waiting for an existing Ollama API on ports ${ODYSSEUS_OLLAMA_PORT}/11434..."')
|
||||
runner_lines.append(' sleep 1')
|
||||
runner_lines.append('done')
|
||||
runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then')
|
||||
runner_lines.append(' if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then')
|
||||
@@ -1015,8 +1116,12 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
runner_lines.append(' echo "=== Process exited with code 127 ==="')
|
||||
runner_lines.append(' exec bash -i')
|
||||
runner_lines.append('fi')
|
||||
runner_lines.append('echo "Starting ollama server on 0.0.0.0:${ODYSSEUS_OLLAMA_PORT}..."')
|
||||
runner_lines.append('OLLAMA_HOST="0.0.0.0:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
|
||||
runner_lines.append('ODYSSEUS_OLLAMA_URL="http://${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}"')
|
||||
if remote and _ollama_host in ("0.0.0.0", "::"):
|
||||
runner_lines.append('echo "[odysseus] WARNING: remote Ollama will bind to ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT} so Odysseus can reach it from this host."')
|
||||
runner_lines.append('echo "[odysseus] Ollama has no built-in authentication; expose this only on a trusted LAN/VPN or provide an explicit OLLAMA_HOST with your own access controls."')
|
||||
runner_lines.append('echo "Starting ollama server on ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}..."')
|
||||
runner_lines.append('OLLAMA_HOST="${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
|
||||
runner_lines.append('_ody_exit=$?')
|
||||
runner_lines.append('echo')
|
||||
runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
|
||||
@@ -1032,19 +1137,24 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
|
||||
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
|
||||
runner_lines.append('if ! command -v vllm &>/dev/null; then')
|
||||
runner_lines.append(' echo "ERROR: vLLM is not installed. Open Cookbook -> Dependencies and install vllm on this server, then launch again."')
|
||||
runner_lines.append(' echo "ERROR: vLLM is not installed."')
|
||||
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
|
||||
runner_lines.append('fi')
|
||||
elif "sglang.launch_server" in req.cmd:
|
||||
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
|
||||
runner_lines.append('if ! python3 -c "import sglang" 2>/dev/null; then')
|
||||
runner_lines.append(' echo "ERROR: SGLang is not installed. Open Cookbook -> Dependencies and install sglang on this server, then launch again."')
|
||||
runner_lines.append('if ! command -v sglang &>/dev/null; then')
|
||||
runner_lines.append(' echo "ERROR: SGLang is not installed."')
|
||||
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
|
||||
runner_lines.append('elif ! ODYSSEUS_SGLANG_IMPORT_ERROR="$(python3 -c "import sglang" 2>&1)"; then')
|
||||
runner_lines.append(' echo "ERROR: SGLang is installed but failed to import."')
|
||||
runner_lines.append(' printf "%s\\n" "$ODYSSEUS_SGLANG_IMPORT_ERROR"')
|
||||
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
|
||||
runner_lines.append('fi')
|
||||
elif "scripts/diffusion_server.py" in req.cmd or ".diffusion_server.py" in req.cmd:
|
||||
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
|
||||
runner_lines.append('if ! python3 -c "import torch, diffusers" 2>/dev/null; then')
|
||||
runner_lines.append(' echo "ERROR: Diffusion serving requires PyTorch + diffusers. Open Cookbook -> Dependencies and install diffusers on this server, then launch again."')
|
||||
runner_lines.append('if ! ODYSSEUS_DIFFUSION_IMPORT_ERROR="$(python3 -c "import torch, diffusers" 2>&1)"; then')
|
||||
runner_lines.append(' echo "ERROR: Diffusion serving requires PyTorch + diffusers."')
|
||||
runner_lines.append(' printf "%s\\n" "$ODYSSEUS_DIFFUSION_IMPORT_ERROR"')
|
||||
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
|
||||
runner_lines.append('fi')
|
||||
|
||||
@@ -1116,11 +1226,16 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
stderr = (await proc.stderr.read()).decode(errors="replace")
|
||||
return {"ok": False, "error": stderr, "session_id": session_id}
|
||||
|
||||
# Auto-register as model endpoint if serving a diffusion model
|
||||
# Auto-register a model endpoint so the served model shows up in the model
|
||||
# picker with no manual /setup step. Diffusion models get an image
|
||||
# endpoint; any other real model serve (i.e. not a pip-install task) gets
|
||||
# a local LLM endpoint pointed at its /v1.
|
||||
endpoint_id = None
|
||||
is_diffusion = "diffusion_server.py" in req.cmd
|
||||
if is_diffusion:
|
||||
endpoint_id = _auto_register_image_endpoint(req, remote)
|
||||
elif not is_pip_install:
|
||||
endpoint_id = _auto_register_llm_endpoint(req, remote)
|
||||
|
||||
# Log to assistant
|
||||
try:
|
||||
@@ -1357,9 +1472,16 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
total_mb = max(0, int(total_bytes / (1024 * 1024)))
|
||||
used_mb = max(0, min(total_mb, int(used_bytes / (1024 * 1024))))
|
||||
free_mb = max(0, total_mb - used_mb)
|
||||
# GTT = the system-RAM pool the GPU pages into when VRAM is full.
|
||||
# On a discrete card a large gtt_used means the model spilled past
|
||||
# VRAM into RAM over PCIe — much slower. Surface it so the UI can
|
||||
# warn "spilling to RAM" instead of the user wondering why it's slow.
|
||||
gtt_used_raw = await _gpu_read_file(f"{base}/mem_info_gtt_used", host, ssh_port)
|
||||
gtt_used_mb = max(0, int(int(gtt_used_raw) / (1024 * 1024))) if (gtt_used_raw and gtt_used_raw.isdigit()) else 0
|
||||
gpus.append({
|
||||
"index": len(gpus), "name": name, "uuid": entry,
|
||||
"free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb,
|
||||
"gtt_used_mb": gtt_used_mb,
|
||||
"util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85),
|
||||
"processes": [], "backend": "rocm", "source": "amd-sysfs",
|
||||
"unified_memory": unified,
|
||||
@@ -1461,6 +1583,46 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
if gpus:
|
||||
return {"ok": True, "gpus": gpus, "backend": "cuda", "source": "nvidia-smi"}
|
||||
|
||||
# Local Apple Silicon / Metal fallback. macOS has no nvidia-smi and no
|
||||
# Linux /sys/class/drm tree, but services.hwfit.hardware already knows
|
||||
# how to size the shared unified-memory GPU budget. Keep this route in
|
||||
# sync so Cookbook's GPU picker doesn't show "nvidia-smi not found" on
|
||||
# native Mac launches.
|
||||
if not host and sys.platform == "darwin":
|
||||
try:
|
||||
from services.hwfit.hardware import detect_system
|
||||
info = detect_system(fresh=True)
|
||||
backend = str(info.get("backend") or "").lower()
|
||||
if backend in {"metal", "mps", "apple"} and info.get("gpu_count", 0) > 0:
|
||||
total_mb = int(float(info.get("gpu_vram_gb") or info.get("total_ram_gb") or 0) * 1024)
|
||||
free_mb = int(float(info.get("available_ram_gb") or 0) * 1024)
|
||||
if total_mb and (free_mb <= 0 or free_mb > total_mb):
|
||||
free_mb = total_mb
|
||||
used_mb = max(0, total_mb - max(0, free_mb))
|
||||
return {
|
||||
"ok": True,
|
||||
"gpus": [{
|
||||
"index": 0,
|
||||
"name": info.get("gpu_name") or info.get("cpu_name") or "Apple Silicon GPU",
|
||||
"uuid": "apple-metal-0",
|
||||
"free_mb": max(0, free_mb),
|
||||
"total_mb": max(0, total_mb),
|
||||
"used_mb": used_mb,
|
||||
"util_pct": 0,
|
||||
"busy": bool(total_mb and (free_mb / total_mb) < 0.5),
|
||||
"processes": [],
|
||||
"backend": "metal",
|
||||
"source": "apple-metal",
|
||||
"unified_memory": True,
|
||||
}],
|
||||
"backend": "metal",
|
||||
"source": "apple-metal",
|
||||
"fallback_from": "nvidia-smi",
|
||||
"nvidia_error": nvidia_error,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning("Apple Metal GPU fallback failed: %s", e)
|
||||
|
||||
amd_gpus = await _probe_amd_sysfs(host, ssh_port)
|
||||
if amd_gpus:
|
||||
return {
|
||||
@@ -1607,6 +1769,33 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
|
||||
disk_tasks = on_disk.get("tasks") or [] if isinstance(on_disk, dict) else []
|
||||
incoming_tasks = data.get("tasks") if isinstance(data.get("tasks"), list) else []
|
||||
# Anti-poisoning guard: a stale browser tab can keep POSTing a
|
||||
# download task as status='done' from before the strict-finish
|
||||
# fix landed, undoing any server-side correction. For each
|
||||
# incoming "done" download, override to "running" if the last
|
||||
# shard pattern says N<total AND no DOWNLOAD_OK/DOWNLOAD_FAILED/
|
||||
# /snapshots/ sentinel is in the output.
|
||||
import re as _re_dl
|
||||
for _it in incoming_tasks:
|
||||
if (not isinstance(_it, dict)) or _it.get("type") != "download" or _it.get("status") != "done":
|
||||
continue
|
||||
_out = _it.get("output") or ""
|
||||
if ("DOWNLOAD_OK" in _out) or ("DOWNLOAD_FAILED" in _out) or ("/snapshots/" in _out):
|
||||
continue
|
||||
_shards = _re_dl.findall(r"model-(\d+)-of-(\d+)\.safetensors", _out)
|
||||
if _shards:
|
||||
_n, _tot = _shards[-1]
|
||||
if int(_n) < int(_tot):
|
||||
logger.info(f"cookbook state POST: rejecting stale done for {_it.get('sessionId')} "
|
||||
f"(last shard {_n}/{_tot}, no DOWNLOAD_OK)")
|
||||
_it["status"] = "running"
|
||||
else:
|
||||
_completed = _out.count("Download complete")
|
||||
_starts = _out.count("Downloading '")
|
||||
if _starts > _completed:
|
||||
logger.info(f"cookbook state POST: rejecting stale done for {_it.get('sessionId')} "
|
||||
f"({_completed}/{_starts} files complete, no DOWNLOAD_OK)")
|
||||
_it["status"] = "running"
|
||||
incoming_ids = {t.get("sessionId") for t in incoming_tasks if isinstance(t, dict) and t.get("sessionId")}
|
||||
import time as _t
|
||||
now_ms = int(_t.time() * 1000)
|
||||
@@ -1763,6 +1952,43 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
def _cookbook_tasks_status_sync():
|
||||
import subprocess
|
||||
|
||||
def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
|
||||
"""Best-effort check for a completed HF cache entry.
|
||||
|
||||
tmux output can stop at a stale progress line if the pane/session
|
||||
disappears before Cookbook captures the final DOWNLOAD_OK marker.
|
||||
In that case, trust the cache shape: a snapshot directory with files
|
||||
and no *.incomplete blobs means HuggingFace finished materializing the
|
||||
model.
|
||||
"""
|
||||
if not repo_id or "/" not in repo_id:
|
||||
return False
|
||||
py = (
|
||||
"import os,sys;"
|
||||
"repo=sys.argv[1];"
|
||||
"base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
|
||||
"d=os.path.join(base,'models--'+repo.replace('/','--'));"
|
||||
"snap=os.path.join(d,'snapshots');"
|
||||
"ok=os.path.isdir(snap) and any(os.path.isdir(os.path.join(snap,x)) and os.listdir(os.path.join(snap,x)) for x in os.listdir(snap));"
|
||||
"inc=False;"
|
||||
"blobs=os.path.join(d,'blobs');"
|
||||
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
|
||||
"sys.exit(0 if ok and not inc else 1)"
|
||||
)
|
||||
cmd = ["python3", "-c", py, repo_id]
|
||||
try:
|
||||
if remote_host:
|
||||
ssh_base = ["ssh"]
|
||||
if ssh_port and ssh_port != "22":
|
||||
ssh_base.extend(["-p", str(ssh_port)])
|
||||
shell_cmd = " ".join(shlex.quote(x) for x in cmd)
|
||||
proc = subprocess.run(ssh_base + [remote_host, shell_cmd], timeout=12, capture_output=True)
|
||||
else:
|
||||
proc = subprocess.run(cmd, timeout=12, capture_output=True)
|
||||
return proc.returncode == 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
# Load saved tasks from cookbook state
|
||||
tasks = []
|
||||
if _cookbook_state_path.exists():
|
||||
@@ -1902,14 +2128,21 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# persists after the process exits, so a finished download still has a
|
||||
# snapshot to classify (DOWNLOAD_OK / exit marker) — evaluate it even
|
||||
# when the PID is gone instead of blindly reporting "stopped".
|
||||
download_zero_files = False
|
||||
status = "unknown"
|
||||
if is_alive or (local_win_task and full_snapshot):
|
||||
lower = full_snapshot.lower()
|
||||
has_exit = "=== process exited with code" in lower
|
||||
exit_match = re.search(r"=== process exited with code\s+(-?\d+)", full_snapshot, re.I)
|
||||
has_exit = exit_match is not None
|
||||
exit_code = int(exit_match.group(1)) if exit_match else None
|
||||
has_error = "error" in lower or "failed" in lower or "traceback" in lower
|
||||
if has_exit and task_type == "serve":
|
||||
# Serve tasks that exit are always errors — they should run indefinitely
|
||||
status = "error"
|
||||
elif has_exit and task_type == "download":
|
||||
# Dependency installs are tracked as download tasks but only
|
||||
# emit the generic runner exit marker, not HF download markers.
|
||||
status = "completed" if exit_code == 0 else "error"
|
||||
elif has_exit and "unrecognized arguments" in lower:
|
||||
status = "error"
|
||||
elif has_error and not ("application startup complete" in lower):
|
||||
@@ -1918,7 +2151,11 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# Only download tasks treat 100% as "completed".
|
||||
# Serve tasks log 100%|██████| during inference progress
|
||||
# (diffusion sampling, etc.) — that's "running", not done.
|
||||
status = "completed"
|
||||
if re.search(r"Fetching\s+0\s+files", full_snapshot, re.IGNORECASE):
|
||||
status = "error"
|
||||
download_zero_files = True
|
||||
else:
|
||||
status = "completed"
|
||||
elif "application startup complete" in lower:
|
||||
status = "ready"
|
||||
elif not is_alive:
|
||||
@@ -1928,7 +2165,14 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
status = "running"
|
||||
else:
|
||||
# Session is dead — check if it completed or crashed
|
||||
status = "stopped"
|
||||
if task_type == "download" and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or "")):
|
||||
status = "completed"
|
||||
if not progress_text:
|
||||
progress_text = "Download complete"
|
||||
if not full_snapshot:
|
||||
full_snapshot = "DOWNLOAD_OK"
|
||||
else:
|
||||
status = "stopped"
|
||||
|
||||
# Parse structured phase info — single source of truth for the UI
|
||||
phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {}
|
||||
@@ -1938,6 +2182,8 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None
|
||||
if diagnosis and status in {"running", "unknown", "stopped"}:
|
||||
status = "error"
|
||||
if download_zero_files:
|
||||
diagnosis = {"message": "No matching files were downloaded. The model repo or filename/quant pattern may be wrong (for example a ':Q4_K_M' tag that does not exist in the repo). Check the repo and the include/quant pattern."}
|
||||
output_tail = "\n".join(full_snapshot.splitlines()[-12:]) if full_snapshot else ""
|
||||
|
||||
results.append({
|
||||
|
||||
Reference in New Issue
Block a user