mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 18:25:26 -04:00
Merge remote-tracking branch 'origin/dev' into test-main-dev-merge-20260615
# Conflicts: # src/tool_implementations.py # static/js/research/panel.js
This commit is contained in:
+94
-73
@@ -15,9 +15,11 @@ from pathlib import Path
|
||||
from fastapi import APIRouter, HTTPException, Request, Depends
|
||||
|
||||
from src.auth_helpers import require_user
|
||||
from src.constants import COOKBOOK_STATE_FILE
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.middleware import require_admin
|
||||
from routes._validators import validate_remote_host, validate_ssh_port
|
||||
from core.platform_compat import (
|
||||
IS_WINDOWS,
|
||||
detached_popen_kwargs,
|
||||
@@ -28,18 +30,26 @@ from core.platform_compat import (
|
||||
which_tool,
|
||||
)
|
||||
from routes.shell_routes import TMUX_LOG_DIR
|
||||
from routes.cookbook_output import (
|
||||
error_aware_output_tail, classify_dead_download,
|
||||
HF_CACHE_COMPLETE_PROBE, HF_CACHE_INCOMPLETE_PROBE,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from routes.cookbook_helpers import (
|
||||
_SSH_PORT_RE, _REMOTE_HOST_RE, _SESSION_ID_RE,
|
||||
_validate_repo_id, _validate_serve_model_id, _validate_include, _validate_remote_host, _validate_token,
|
||||
_validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path,
|
||||
_SESSION_ID_RE, _validate_repo_id, _validate_serve_model_id, _validate_include, _validate_token,
|
||||
_validate_local_dir, _validate_gpus, _shell_path,
|
||||
_ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
|
||||
_safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
|
||||
_append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
|
||||
load_stored_hf_token,
|
||||
_append_vllm_linux_preflight_lines, _ollama_bind_from_cmd, _pip_install_fallback_chain,
|
||||
_pip_install_no_cache, _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
|
||||
_diagnose_serve_output, run_ssh_command_async,
|
||||
_ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache,
|
||||
_user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
|
||||
_normalize_llama_cpp_python_cache_types,
|
||||
ModelDownloadRequest, ServeRequest,
|
||||
)
|
||||
|
||||
@@ -48,13 +58,13 @@ _HF_TOKEN_STATUS_SNIPPET = (
|
||||
'echo "[odysseus] HF token: applied"; '
|
||||
'else '
|
||||
'echo "[odysseus] HF token: NOT SET — gated/private models will be denied. '
|
||||
'Add one in Odysseus Settings -> Cookbook -> HuggingFace Token."; '
|
||||
'Add one in Odysseus Cookbook -> Settings -> HuggingFace Token."; '
|
||||
'fi'
|
||||
)
|
||||
|
||||
def setup_cookbook_routes() -> APIRouter:
|
||||
router = APIRouter(tags=["cookbook"])
|
||||
_cookbook_state_path = Path(os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
|
||||
_cookbook_state_path = Path(COOKBOOK_STATE_FILE)
|
||||
|
||||
def _mask_secret(value: str) -> str:
|
||||
if not value:
|
||||
@@ -164,6 +174,16 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
"vLLM is not installed or not in PATH on this server.",
|
||||
[{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
|
||||
),
|
||||
(
|
||||
r"sgl_kernel[\s\S]*(Python\.h|libnuma\.so\.1|common_ops)|"
|
||||
r"(Python\.h|libnuma\.so\.1|common_ops)[\s\S]*sgl_kernel|"
|
||||
r"Please ensure sgl_kernel is properly installed",
|
||||
"SGLang native dependencies are missing on this server.",
|
||||
[
|
||||
{"label": "install OS packages: libnuma-dev python3.12-dev build-essential", "op": "manual"},
|
||||
{"label": "upgrade sglang-kernel after OS packages are installed", "op": "manual"},
|
||||
],
|
||||
),
|
||||
(
|
||||
r"sglang.*command not found|No module named sglang|SGLang is not installed",
|
||||
"SGLang is not installed or not in PATH on this server.",
|
||||
@@ -232,14 +252,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
return state
|
||||
|
||||
def _load_stored_hf_token() -> str:
|
||||
if not _cookbook_state_path.exists():
|
||||
return ""
|
||||
try:
|
||||
state = json.loads(_cookbook_state_path.read_text(encoding="utf-8"))
|
||||
env = state.get("env") if isinstance(state, dict) else {}
|
||||
return _decrypt_secret(env.get("hfToken") if isinstance(env, dict) else "")
|
||||
except Exception:
|
||||
return ""
|
||||
return load_stored_hf_token(state_path=_cookbook_state_path)
|
||||
|
||||
def _cookbook_ssh_dir() -> Path:
|
||||
# The Docker image keeps cookbook keys under /app/.ssh; that path only
|
||||
@@ -354,7 +367,11 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# all output to the log the poller reads. Paths handed to bash use
|
||||
# POSIX form + shell-quoting so drive paths / spaces survive.
|
||||
inner = TMUX_LOG_DIR / f"{session_id}_run.sh"
|
||||
inner.write_text("\n".join(bash_lines) + "\n", encoding="utf-8")
|
||||
pp = shlex.quote(pid_path.as_posix())
|
||||
inner.write_text(
|
||||
f"printf '%s\\n' \"$$\" > {pp}\n" + "\n".join(bash_lines) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
lp = shlex.quote(log_path.as_posix())
|
||||
ip = shlex.quote(inner.as_posix())
|
||||
script_path = TMUX_LOG_DIR / f"{session_id}.sh"
|
||||
@@ -406,8 +423,8 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
else:
|
||||
_validate_repo_id(req.repo_id)
|
||||
_validate_include(req.include)
|
||||
_validate_remote_host(req.remote_host)
|
||||
req.ssh_port = _validate_ssh_port(req.ssh_port)
|
||||
validate_remote_host(req.remote_host)
|
||||
req.ssh_port = validate_ssh_port(req.ssh_port)
|
||||
req.local_dir = _validate_local_dir(req.local_dir)
|
||||
req.hf_token = "" if is_ollama_download else (req.hf_token or _load_stored_hf_token())
|
||||
_validate_token(req.hf_token)
|
||||
@@ -738,9 +755,8 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# Validate shell-bound inputs, matching the sibling list_gpus endpoint —
|
||||
# `host`/`ssh_port` are interpolated into an ssh command below, so an
|
||||
# unvalidated value (e.g. "x'; rm -rf ~ #") would be command injection.
|
||||
host = _validate_remote_host(host)
|
||||
if ssh_port is not None and ssh_port != "" and not _SSH_PORT_RE.fullmatch(ssh_port):
|
||||
raise HTTPException(400, "Invalid ssh_port")
|
||||
host = validate_remote_host(host)
|
||||
ssh_port = validate_ssh_port(ssh_port)
|
||||
TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
model_dirs = []
|
||||
@@ -889,11 +905,16 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# listening" check without requiring ss/netstat/nmap.
|
||||
ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"]
|
||||
if ssh_port and str(ssh_port) != "22":
|
||||
if not _SSH_PORT_RE.match(str(ssh_port)):
|
||||
try:
|
||||
ssh_port = validate_ssh_port(ssh_port)
|
||||
except HTTPException:
|
||||
return None
|
||||
ssh_base.extend(["-p", str(ssh_port)])
|
||||
host_arg = remote
|
||||
if not _REMOTE_HOST_RE.match(host_arg):
|
||||
try:
|
||||
host_arg = validate_remote_host(remote)
|
||||
except HTTPException:
|
||||
return None
|
||||
if not host_arg:
|
||||
return None
|
||||
probe_ports = " ".join(str(start_port + i) for i in range(max_offset + 1))
|
||||
script = (
|
||||
@@ -1196,8 +1217,8 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
"""
|
||||
require_admin(request)
|
||||
# Defence-in-depth: reject values that could break out of shell contexts.
|
||||
_validate_remote_host(req.remote_host)
|
||||
req.ssh_port = _validate_ssh_port(req.ssh_port)
|
||||
validate_remote_host(req.remote_host)
|
||||
req.ssh_port = validate_ssh_port(req.ssh_port)
|
||||
req.gpus = _validate_gpus(req.gpus)
|
||||
req.hf_token = req.hf_token or _load_stored_hf_token()
|
||||
_validate_token(req.hf_token)
|
||||
@@ -1208,6 +1229,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# many downstream `"engine" in req.cmd` membership checks can't hit
|
||||
# `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
|
||||
req.cmd = _validate_serve_cmd(req.cmd) or ""
|
||||
req.cmd = _normalize_llama_cpp_python_cache_types(req.cmd) or ""
|
||||
req.cmd = _venv_safe_local_pip_install_cmd(
|
||||
req.cmd,
|
||||
local=not bool(req.remote_host),
|
||||
@@ -1637,12 +1659,11 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
async def server_setup(request: Request, req: SetupRequest):
|
||||
"""Install required dependencies on a remote server via SSH."""
|
||||
require_admin(request)
|
||||
host = _validate_remote_host(req.host)
|
||||
host = validate_remote_host(req.host)
|
||||
if not host:
|
||||
raise HTTPException(400, "host is required")
|
||||
port = req.ssh_port
|
||||
if port is not None and port != "" and not re.fullmatch(r"\d{1,5}", port):
|
||||
raise HTTPException(400, "Invalid ssh_port")
|
||||
port = validate_ssh_port(port)
|
||||
pf = f"-p {port} " if port and port != "22" else ""
|
||||
|
||||
# Detect platform: Windows first (echo %OS% → Windows_NT), then Termux, then Linux
|
||||
@@ -1886,9 +1907,8 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
`busy` is True when free_mb/total_mb < 0.5.
|
||||
"""
|
||||
require_admin(request)
|
||||
host = _validate_remote_host(host)
|
||||
if ssh_port is not None and ssh_port != "" and not _SSH_PORT_RE.fullmatch(ssh_port):
|
||||
raise HTTPException(400, "Invalid ssh_port")
|
||||
host = validate_remote_host(host)
|
||||
ssh_port = validate_ssh_port(ssh_port)
|
||||
gpu_query = "nvidia-smi --query-gpu=index,name,memory.free,memory.total,memory.used,utilization.gpu,uuid --format=csv,noheader,nounits"
|
||||
nvidia_error = None
|
||||
try:
|
||||
@@ -2045,9 +2065,8 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
sig = (req.signal or "TERM").upper()
|
||||
if sig not in ("TERM", "KILL", "INT"):
|
||||
raise HTTPException(400, "signal must be TERM, KILL, or INT")
|
||||
host = _validate_remote_host(req.host)
|
||||
if req.ssh_port and not _SSH_PORT_RE.fullmatch(req.ssh_port):
|
||||
raise HTTPException(400, "Invalid ssh_port")
|
||||
host = validate_remote_host(req.host)
|
||||
req.ssh_port = validate_ssh_port(req.ssh_port)
|
||||
kill_cmd = f"kill -{sig} {req.pid}"
|
||||
try:
|
||||
if host:
|
||||
@@ -2381,14 +2400,19 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
host = (srv.get("host") or "").strip()
|
||||
if not host:
|
||||
continue # local-only entry; the /proc scan handles it
|
||||
if not _REMOTE_HOST_RE.match(host):
|
||||
try:
|
||||
host = validate_remote_host(host)
|
||||
except HTTPException:
|
||||
continue
|
||||
sport = str(srv.get("port") or "").strip()
|
||||
ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"]
|
||||
if sport and sport != "22":
|
||||
if not _SSH_PORT_RE.match(sport):
|
||||
try:
|
||||
sport = validate_ssh_port(sport)
|
||||
except HTTPException:
|
||||
continue
|
||||
ssh_base.extend(["-p", sport])
|
||||
if sport != "22":
|
||||
ssh_base.extend(["-p", sport])
|
||||
|
||||
try:
|
||||
ls = subprocess.run(
|
||||
@@ -2802,30 +2826,20 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
def _cookbook_tasks_status_sync():
|
||||
import subprocess
|
||||
|
||||
def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
|
||||
def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "", cache_root: str = "") -> bool:
|
||||
"""Best-effort check for a completed HF cache entry.
|
||||
|
||||
tmux output can stop at a stale progress line if the pane/session
|
||||
disappears before Cookbook captures the final DOWNLOAD_OK marker.
|
||||
In that case, trust the cache shape: a snapshot directory with files
|
||||
and no *.incomplete blobs means HuggingFace finished materializing the
|
||||
model.
|
||||
model. cache_root is the task's custom download dir — the runner
|
||||
pointed HF_HOME there, so the cache lives under <cache_root>/hub,
|
||||
not wherever this probe's environment says.
|
||||
"""
|
||||
if not repo_id or "/" not in repo_id:
|
||||
return False
|
||||
py = (
|
||||
"import os,sys;"
|
||||
"repo=sys.argv[1];"
|
||||
"base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
|
||||
"d=os.path.join(base,'models--'+repo.replace('/','--'));"
|
||||
"snap=os.path.join(d,'snapshots');"
|
||||
"ok=os.path.isdir(snap) and any(os.path.isdir(os.path.join(snap,x)) and os.listdir(os.path.join(snap,x)) for x in os.listdir(snap));"
|
||||
"inc=False;"
|
||||
"blobs=os.path.join(d,'blobs');"
|
||||
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
|
||||
"sys.exit(0 if ok and not inc else 1)"
|
||||
)
|
||||
cmd = ["python3", "-c", py, repo_id]
|
||||
cmd = ["python3", "-c", HF_CACHE_COMPLETE_PROBE, repo_id, cache_root or ""]
|
||||
try:
|
||||
if remote_host:
|
||||
ssh_base = ["ssh"]
|
||||
@@ -2839,7 +2853,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
|
||||
def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "", cache_root: str = "") -> bool:
|
||||
"""Best-effort check for resumable HF partial blobs.
|
||||
|
||||
A lost SSH/tmux session can leave a real download still incomplete.
|
||||
@@ -2848,16 +2862,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
"""
|
||||
if not repo_id or "/" not in repo_id:
|
||||
return False
|
||||
py = (
|
||||
"import os,sys;"
|
||||
"repo=sys.argv[1];"
|
||||
"base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
|
||||
"d=os.path.join(base,'models--'+repo.replace('/','--'));"
|
||||
"blobs=os.path.join(d,'blobs');"
|
||||
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
|
||||
"sys.exit(0 if inc else 1)"
|
||||
)
|
||||
cmd = ["python3", "-c", py, repo_id]
|
||||
cmd = ["python3", "-c", HF_CACHE_INCOMPLETE_PROBE, repo_id, cache_root or ""]
|
||||
try:
|
||||
if remote_host:
|
||||
ssh_base = ["ssh"]
|
||||
@@ -2929,12 +2934,18 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
if not _SESSION_ID_RE.match(session_id):
|
||||
logger.warning(f"Skipping task with unsafe session_id: {session_id!r}")
|
||||
continue
|
||||
if remote and not _REMOTE_HOST_RE.match(remote):
|
||||
logger.warning(f"Skipping task with unsafe remoteHost: {remote!r}")
|
||||
continue
|
||||
if _tport and not _SSH_PORT_RE.match(str(_tport)):
|
||||
logger.warning(f"Skipping task with unsafe sshPort: {_tport!r}")
|
||||
continue
|
||||
if remote:
|
||||
try:
|
||||
remote = validate_remote_host(remote)
|
||||
except HTTPException:
|
||||
logger.warning(f"Skipping task with unsafe remoteHost: {remote!r}")
|
||||
continue
|
||||
if _tport:
|
||||
try:
|
||||
_tport = validate_ssh_port(str(_tport))
|
||||
except HTTPException:
|
||||
logger.warning(f"Skipping task with unsafe sshPort: {_tport!r}")
|
||||
continue
|
||||
if task_platform == "windows" and remote:
|
||||
# Windows: check PID file + Get-Process, read log tail
|
||||
sd = "$env:TEMP\\odysseus-sessions"
|
||||
@@ -3047,6 +3058,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# snapshot to classify (DOWNLOAD_OK / exit marker) — evaluate it even
|
||||
# when the PID is gone instead of blindly reporting "stopped".
|
||||
download_zero_files = False
|
||||
exit_code = None
|
||||
status = "unknown"
|
||||
download_has_ok = task_type == "download" and "DOWNLOAD_OK" in full_snapshot
|
||||
download_has_failed = task_type == "download" and "DOWNLOAD_FAILED" in full_snapshot
|
||||
@@ -3055,7 +3067,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
and (
|
||||
".incomplete" in full_snapshot
|
||||
or bool(re.search(r'model-\d+-of-\d+\.[A-Za-z0-9_.-]+:\s+(?:[0-9]|[1-8][0-9])%', full_snapshot))
|
||||
or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""))
|
||||
or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""), _payload.get("local_dir") or "")
|
||||
)
|
||||
)
|
||||
if is_alive or (local_win_task and full_snapshot):
|
||||
@@ -3096,11 +3108,19 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
else:
|
||||
status = "running"
|
||||
else:
|
||||
# Session is dead — check if it completed or crashed
|
||||
if (
|
||||
# Session is dead — check if it completed or crashed. The
|
||||
# runner markers in the retained output are conclusive
|
||||
# (DOWNLOAD_OK only prints after exit 0), so check them before
|
||||
# the cache probe, which can't see ollama pulls at all.
|
||||
marker = classify_dead_download(full_snapshot) if task_type == "download" else None
|
||||
if marker is not None:
|
||||
status, download_zero_files = marker
|
||||
if status == "completed" and not progress_text:
|
||||
progress_text = "Download complete"
|
||||
elif (
|
||||
task_type == "download"
|
||||
and not download_has_incomplete_evidence
|
||||
and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""))
|
||||
and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""), _payload.get("local_dir") or "")
|
||||
):
|
||||
status = "completed"
|
||||
if not progress_text:
|
||||
@@ -3120,7 +3140,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
status = "error"
|
||||
if download_zero_files:
|
||||
diagnosis = {"message": "No matching files were downloaded. The model repo or filename/quant pattern may be wrong (for example a ':Q4_K_M' tag that does not exist in the repo). Check the repo and the include/quant pattern."}
|
||||
output_tail = "\n".join(full_snapshot.splitlines()[-12:]) if full_snapshot else ""
|
||||
output_tail = error_aware_output_tail(full_snapshot, status)
|
||||
|
||||
results.append({
|
||||
"session_id": session_id,
|
||||
@@ -3131,6 +3151,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
"phase": serve_phase,
|
||||
"diagnosis": diagnosis,
|
||||
"output_tail": output_tail,
|
||||
"exit_code": exit_code,
|
||||
"cmd": _payload.get("_cmd") or "",
|
||||
"tps": phase_info.get("tps"),
|
||||
"reqs": phase_info.get("reqs"),
|
||||
|
||||
Reference in New Issue
Block a user