diff --git a/Dockerfile b/Dockerfile index 996e06faa..bed5e2002 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,6 +20,23 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ gosu \ && rm -rf /var/lib/apt/lists/* +# Docker CLI (client only — daemon stays on the host via the +# /var/run/docker.sock mount). The Debian `docker.io` package ships +# dockerd but not the client binary on slim, so grab the static client +# tarball from download.docker.com instead. +ARG DOCKER_CLI_VERSION=27.5.1 +RUN ARCH="$(dpkg --print-architecture)" \ + && case "$ARCH" in \ + amd64) DARCH=x86_64 ;; \ + arm64) DARCH=aarch64 ;; \ + *) echo "unsupported arch $ARCH"; exit 1 ;; \ + esac \ + && curl -fsSL "https://download.docker.com/linux/static/stable/${DARCH}/docker-${DOCKER_CLI_VERSION}.tgz" \ + -o /tmp/docker.tgz \ + && tar -xzf /tmp/docker.tgz -C /tmp \ + && install -m 0755 /tmp/docker/docker /usr/local/bin/docker \ + && rm -rf /tmp/docker /tmp/docker.tgz + WORKDIR /app # Install Python deps first (layer cache). Optional extras (PyMuPDF AGPL, etc.) diff --git a/docker-compose.gpu-amd.yml b/docker-compose.gpu-amd.yml index 82e22e440..5d5f8427e 100644 --- a/docker-compose.gpu-amd.yml +++ b/docker-compose.gpu-amd.yml @@ -28,6 +28,14 @@ services: # land under /app/.local for the odysseus user. Persist them so a # container recreate does not silently remove installed serve engines. - ${APP_DATA_DIR:-./data}/local:/app/.local:z + # Docker socket — lets Cookbook launch commands like + # `docker exec ollama-rocm ollama show ` reach the host's + # Docker daemon (and sibling containers like ollama-rocm / + # ollama-test). The in-container user needs to be in the + # socket's owning group — see `group_add` below; the GID + # there must match the host's `docker` group (defaults to 963 + # on Debian, 999 on Ubuntu — override via env if yours differs). 
+ - /var/run/docker.sock:/var/run/docker.sock extra_hosts: # Lets the container reach local services on the Docker host, including # Ollama at http://host.docker.internal:11434. @@ -93,6 +101,7 @@ services: - /dev/kfd - /dev/dri group_add: + - "${DOCKER_GID:-963}" - video - ${RENDER_GID:-render} diff --git a/docker-compose.gpu-nvidia.yml b/docker-compose.gpu-nvidia.yml index 1b551c669..c1f2cddb0 100644 --- a/docker-compose.gpu-nvidia.yml +++ b/docker-compose.gpu-nvidia.yml @@ -27,6 +27,16 @@ services: # land under /app/.local for the odysseus user. Persist them so a # container recreate does not silently remove installed serve engines. - ${APP_DATA_DIR:-./data}/local:/app/.local:z + # Docker socket — lets Cookbook launch commands like + # `docker exec ollama-rocm ollama show ` reach the host's + # Docker daemon (and sibling containers like ollama-rocm / + # ollama-test). The in-container user needs to be in the + # socket's owning group — see `group_add` below; the GID + # there must match the host's `docker` group (defaults to 963 + # on Debian, 999 on Ubuntu — override via env if yours differs). + - /var/run/docker.sock:/var/run/docker.sock + group_add: + - "${DOCKER_GID:-963}" extra_hosts: # Lets the container reach local services on the Docker host, including # Ollama at http://host.docker.internal:11434. diff --git a/docker-compose.yml b/docker-compose.yml index cbeec1e37..77840e22b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,6 +16,16 @@ services: # land under /app/.local for the odysseus user. Persist them so a # container recreate does not silently remove installed serve engines. - ${APP_DATA_DIR:-./data}/local:/app/.local:z + # Docker socket — lets Cookbook launch commands like + # `docker exec ollama-rocm ollama show ` reach the host's + # Docker daemon (and sibling containers like ollama-rocm / + # ollama-test). 
The in-container user needs to be in the + # socket's owning group — see `group_add` below; the GID + # there must match the host's `docker` group (defaults to 963 + # on Debian, 999 on Ubuntu — override via env if yours differs). + - /var/run/docker.sock:/var/run/docker.sock + group_add: + - "${DOCKER_GID:-963}" extra_hosts: # Lets the container reach local services on the Docker host, including # Ollama at http://host.docker.internal:11434. diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 8a3ab4bb6..fc0e87a08 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -26,6 +26,27 @@ if ! getent passwd "$PUID" >/dev/null 2>&1; then useradd -u "$PUID" -g "$PGID" -M -s /bin/sh -d /app odysseus fi +ODY_USER="$(getent passwd "$PUID" | cut -d: -f1)" +[ -z "$ODY_USER" ] && ODY_USER=odysseus + +# Docker-socket group plumbing. When /var/run/docker.sock is bind-mounted +# (Cookbook uses docker exec to reach sibling containers), the socket is +# owned by root:. Add the app user to that group and later +# call gosu by username so supplementary groups are retained. +DOCKER_SOCK="${DOCKER_SOCK:-/var/run/docker.sock}" +if [ -S "$DOCKER_SOCK" ]; then + SOCK_GID="$(stat -c '%g' "$DOCKER_SOCK" 2>/dev/null || echo '')" + if [ -n "$SOCK_GID" ] && [ "$SOCK_GID" != "0" ]; then + if ! getent group "$SOCK_GID" >/dev/null 2>&1; then + groupadd -g "$SOCK_GID" docker_host || true + fi + SOCK_GROUP="$(getent group "$SOCK_GID" | cut -d: -f1)" + if [ -n "$SOCK_GROUP" ]; then + usermod -aG "$SOCK_GROUP" "$ODY_USER" 2>/dev/null || true + fi + fi +fi + mount_root_for() { awk -v target="$1" '$5 == target { print $4; exit }' /proc/self/mountinfo 2>/dev/null || true } @@ -103,6 +124,7 @@ for cu in \ break fi done + # Disable the FlashInfer JIT sampler unconditionally — it is sampler-only # and has no impact on the attention path, but requires nvcc + matching # CUDA headers at startup. 
Without this, vLLM crashes with "Could not find @@ -116,9 +138,9 @@ export PATH="/app/.local/bin:$PATH" # Run first-time setup as the app user so data/ files get the right ownership. # setup.py is idempotent — skips auth.json / .env if they already exist. # || true so a setup failure never prevents the container from starting. -"$GOSU_BIN" "$PUID:$PGID" "$PYTHON_BIN" /app/setup.py || true +"$GOSU_BIN" "$ODY_USER" "$PYTHON_BIN" /app/setup.py || true # Drop root and run the actual app. `gosu` is preferred over `su` / # `sudo` because it cleans up the process tree (no extra shell layer) # so signals (SIGTERM from `docker stop`) reach uvicorn directly. -exec "$GOSU_BIN" "$PUID:$PGID" "$@" +exec "$GOSU_BIN" "$ODY_USER" "$@" diff --git a/routes/chat_helpers.py b/routes/chat_helpers.py index 880b4a5eb..06c92ac6b 100644 --- a/routes/chat_helpers.py +++ b/routes/chat_helpers.py @@ -22,6 +22,31 @@ from fastapi import HTTPException logger = logging.getLogger(__name__) +_CASUAL_OPENING_RE = re.compile( + r"^\s*(?:h+i+|hey+|hello+|yo+|sup+|what'?s up|wass?up|hiya|howdy|" + r"lol|lmao|haha+|hehe+|thanks?|thank you|ty|idk|dunno|meh|bruh|bro)\b(?P<tail>.*)$", + re.IGNORECASE, +) +_CASUAL_BLOCKLIST_RE = re.compile( + r"\b(?:cookbook|serve|serving|launch|start|vllm|sglang|llama\.?cpp|ollama|" + r"download|model|email|document|doc|note|calendar|task|search|web|research|" + r"file|folder|repo|git|settings?|endpoint|api|token|mcp)\b", + re.IGNORECASE, +) + + +def _is_casual_low_signal(text: str) -> bool: + """Short greetings/slang should not pull memory, skills, RAG, or docs.""" + s = str(text or "").strip() + m = _CASUAL_OPENING_RE.match(s) + if not m: + return False + tail = m.group("tail") or "" + if _CASUAL_BLOCKLIST_RE.search(tail): + return False + tail_words = re.findall(r"[A-Za-z0-9_'-]+", tail) + return len(tail_words) <= 2 + # Strong references to in-flight fire-and-forget tasks scheduled from this # module. 
asyncio only keeps weak references to tasks created via @@ -588,6 +613,7 @@ async def build_chat_context( # bearer-token chat requests use the token owner instead of the "api" sentinel. user = effective_user(request) uprefs = load_prefs_for_user(user) + casual_low_signal = _is_casual_low_signal(message) # Memory enabled? mem_enabled = not incognito and not no_memory and uprefs.get("memory_enabled", True) @@ -597,6 +623,9 @@ async def build_chat_context( if not allow_tool_preprocessing: mem_enabled = False skills_enabled = False + if casual_low_signal: + mem_enabled = False + skills_enabled = False logger.debug( "Memory enabled=%s for user=%s (incognito=%s, no_memory=%s, pref=%s)", mem_enabled, user, incognito, no_memory, uprefs.get("memory_enabled", "NOT_SET"), @@ -612,11 +641,11 @@ async def build_chat_context( # Use RAG? use_rag_val = (str(use_rag).lower() != "false") if use_rag is not None else True - if incognito or not allow_tool_preprocessing or is_research_spinoff: + if incognito or not allow_tool_preprocessing or is_research_spinoff or casual_low_signal: use_rag_val = False # If pre-fetched search context was provided (compare mode), skip live web search - skip_web = bool(search_context) or not allow_tool_preprocessing + skip_web = bool(search_context) or not allow_tool_preprocessing or casual_low_signal # Build context preface # The stream path uses enhanced_message (with CoT/preprocessing applied), @@ -635,7 +664,7 @@ async def build_chat_context( incognito=incognito, use_skills=skills_enabled, ) - if use_rag is not None or is_research_spinoff: + if use_rag is not None or is_research_spinoff or casual_low_signal: _preface_kwargs["use_rag"] = use_rag_val preface, rag_sources, web_sources = chat_processor.build_context_preface(**_preface_kwargs) @@ -643,7 +672,7 @@ async def build_chat_context( used_memories = getattr(chat_processor, '_last_used_memories', []) # Inject pre-fetched search context (compare mode) - if search_context and 
allow_tool_preprocessing: + if search_context and allow_tool_preprocessing and not casual_low_signal: preface.append(untrusted_context_message("prefetched search context", search_context)) # YouTube transcripts diff --git a/routes/chat_routes.py b/routes/chat_routes.py index 7fb328ec7..b4a6ed837 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -829,7 +829,11 @@ def setup_chat_routes( from src.settings import get_setting _global_disabled = get_setting("disabled_tools", []) if _global_disabled and isinstance(_global_disabled, list): - disabled_tools.update(_global_disabled) + explicit_web_allowed = allow_web_search is not None and str(allow_web_search).lower() == "true" + if explicit_web_allowed: + disabled_tools.update(t for t in _global_disabled if t not in {"web_search", "web_fetch"}) + else: + disabled_tools.update(_global_disabled) # Light auto-escalation: the user is in chat mode and just expressed a # notes/calendar/email intent. Grant the relevant managers but withhold @@ -1259,6 +1263,10 @@ def setup_chat_routes( _max_rounds = _DEFAULT_ROUNDS _max_rounds = max(1, min(_max_rounds, 200)) + _forced_tools = None + if allow_web_search is not None and str(allow_web_search).lower() == "true": + _forced_tools = {"web_search", "web_fetch"} + async for chunk in stream_agent_loop( sess.endpoint_url, sess.model, @@ -1280,6 +1288,7 @@ def setup_chat_routes( plan_mode=plan_mode, approved_plan=approved_plan or None, workspace=workspace or None, + forced_tools=_forced_tools, ): if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): try: diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py index cc2daebdb..51f019edb 100644 --- a/routes/cookbook_helpers.py +++ b/routes/cookbook_helpers.py @@ -786,25 +786,149 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None: to hard-wire CUDA on Linux. 
That made ROCm hosts attempt a CUDA configure and fail with "CUDA Toolkit not found" instead of building with HIP. """ + # Try a prebuilt binary from llama.cpp's GitHub releases FIRST — no + # cmake/build-essential/git/CUDA-headers needed at all. The from-source + # build below stays as a fallback (custom flags, esoteric arch, no + # internet, etc). 30 seconds vs 5+ minutes of compile, and removes + # every OS-package dep from the launch path. Sets _odysseus_have_prebuilt=1 + # on success; the existing build-tier if/elif chain below is gated on + # that variable so we never compile twice or shadow the prebuilt symlink. + runner_lines.append(' _odysseus_have_prebuilt=""') + runner_lines.append(' _odysseus_arch="$(uname -m)"') + runner_lines.append(' _odysseus_prebuilt_url=""') + runner_lines.append(' if command -v curl >/dev/null 2>&1 && [ "$_odysseus_arch" = "x86_64" ]; then') + runner_lines.append(' _odysseus_pat=""') + runner_lines.append(' _odysseus_has_nv_inline() { command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU "; }') + runner_lines.append(' _odysseus_has_vk_inline() { ldconfig -p 2>/dev/null | grep -q "libvulkan\\.so" || command -v vulkaninfo >/dev/null 2>&1 || [ -e /usr/lib/x86_64-linux-gnu/libvulkan.so.1 ]; }') + runner_lines.append(' _odysseus_has_vkdev_inline() { ls /dev/dri/renderD* >/dev/null 2>&1 || (lspci 2>/dev/null | grep -Ei \'VGA|3D|Display\' | grep -Eiq \'AMD|ATI|Radeon\'); }') + runner_lines.append(' if _odysseus_has_nv_inline; then') + runner_lines.append(' _odysseus_pat="ubuntu.*cuda"') + runner_lines.append(' elif _odysseus_has_vkdev_inline && _odysseus_has_vk_inline; then') + runner_lines.append(' _odysseus_pat="ubuntu.*vulkan"') + runner_lines.append(' else') + runner_lines.append(' _odysseus_pat="ubuntu-x64\\\\.zip"') + runner_lines.append(' fi') + runner_lines.append(' _odysseus_prebuilt_url="$(curl -fsSL --max-time 15 https://api.github.com/repos/ggml-org/llama.cpp/releases/latest 2>/dev/null | grep 
\'"browser_download_url"\' | cut -d\'"\' -f4 | grep -iE "$_odysseus_pat" | grep -iv "arm\\|aarch64" | head -1)"') + runner_lines.append(' fi') + # Accept any of unzip / bsdtar / python3 -m zipfile as the extractor. + # python3 is essentially always present on modern Linux, so this lets + # the prebuilt path work on minimal Ubuntu installs that lack `unzip`. + runner_lines.append(' if [ -n "$_odysseus_prebuilt_url" ] && (command -v unzip >/dev/null 2>&1 || command -v bsdtar >/dev/null 2>&1 || command -v python3 >/dev/null 2>&1); then') + runner_lines.append(' echo "[odysseus] Found prebuilt llama-server: $_odysseus_prebuilt_url"') + runner_lines.append(' mkdir -p ~/bin "$HOME/.cache/odysseus/llama-cpp-prebuilt" && cd "$HOME/.cache/odysseus/llama-cpp-prebuilt"') + runner_lines.append(' rm -f llama-cpp.zip') + runner_lines.append(' if curl -fsSL --max-time 120 "$_odysseus_prebuilt_url" -o llama-cpp.zip && [ -s llama-cpp.zip ]; then') + runner_lines.append(' rm -rf build && mkdir -p build') + runner_lines.append(' if command -v unzip >/dev/null 2>&1; then unzip -qq -o llama-cpp.zip -d build; elif command -v bsdtar >/dev/null 2>&1; then bsdtar -xf llama-cpp.zip -C build; else python3 -c "import zipfile; zipfile.ZipFile(\\"llama-cpp.zip\\").extractall(\\"build\\")"; fi') + runner_lines.append(' _odysseus_extracted="$(find "$PWD/build" -type f -name llama-server 2>/dev/null | head -1)"') + runner_lines.append(' if [ -n "$_odysseus_extracted" ]; then') + runner_lines.append(' chmod +x "$_odysseus_extracted"') + runner_lines.append(' ln -sf "$_odysseus_extracted" ~/bin/llama-server') + runner_lines.append(' _odysseus_libdir="$(dirname "$_odysseus_extracted")"') + runner_lines.append(' mkdir -p ~/.config && echo "export LD_LIBRARY_PATH=\\"$_odysseus_libdir:\\${LD_LIBRARY_PATH:-}\\"" > ~/.config/odysseus-llama-cpp-env') + runner_lines.append(' _odysseus_have_prebuilt=1') + runner_lines.append(' echo "[odysseus] Prebuilt llama-server installed at $_odysseus_extracted"') + 
runner_lines.append(' fi') + runner_lines.append(' fi') + runner_lines.append(' [ -z "$_odysseus_have_prebuilt" ] && echo "[odysseus] Prebuilt download/extract failed — falling back to from-source build."') + runner_lines.append(' elif [ -z "$_odysseus_prebuilt_url" ]; then') + runner_lines.append(' echo "[odysseus] No matching prebuilt llama-server for this host (arch=$_odysseus_arch) — will build from source."') + runner_lines.append(' fi') + runner_lines.append(' if [ -z "$_odysseus_have_prebuilt" ]; then') # Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put it on PATH - # so cmake's CUDA configure can find it. We keep this after the ROCm/HIP - # check — a machine with both stacks should honor the native HIP toolchain on - # AMD hosts instead of accidentally preferring a stray nvcc wheel. - runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do') - runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break') - runner_lines.append(' done') + # so cmake's CUDA configure can find it — BUT only when actual NVIDIA + # hardware is present. On AMD/Intel hosts the pip nvcc is a misleading + # leftover (no libcudart, no GPU it could target) and would otherwise + # send the build down the CUDA branch and fail with "CUDA Toolkit not + # found" instead of trying Vulkan. 
+ runner_lines.append(' _odysseus_has_nvidia_hw() {') + runner_lines.append(' command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU " && return 0') + runner_lines.append(' ls /dev/nvidia* >/dev/null 2>&1 && return 0') + runner_lines.append(' lspci 2>/dev/null | grep -iE \'VGA|3D|Display\' | grep -iq nvidia && return 0') + runner_lines.append(' return 1') + runner_lines.append(' }') + runner_lines.append(' if _odysseus_has_nvidia_hw; then') + runner_lines.append(' for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do') + runner_lines.append(' [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break') + runner_lines.append(' done') + runner_lines.append(' fi') # rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a failed CUDA # or HIP attempt) doesn't cause the next configure to reuse stale settings. runner_lines.append(' mkdir -p ~/bin') - runner_lines.append(' cd ~/llama.cpp && rm -rf build') + # Try to install cmake / build-essential / git automatically before the + # build, but ONLY via passwordless sudo (`sudo -n`) — interactive sudo + # would hang a tmux-backgrounded serve task waiting for a password. If + # sudo asks for a password the install is skipped silently and the + # diagnosis pattern (cookbook_routes.py / cookbook_helpers.py) surfaces + # an explicit "install cmake" suggestion in the Cookbook diagnosis + # toolbar after the inevitable build failure. 
+ runner_lines.append(' _odysseus_apt_bootstrap() {') + runner_lines.append(' local _missing=""') + runner_lines.append(' command -v cmake >/dev/null 2>&1 || _missing="$_missing cmake"') + runner_lines.append(' command -v g++ >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || _missing="$_missing build-essential"') + runner_lines.append(' command -v git >/dev/null 2>&1 || _missing="$_missing git"') + runner_lines.append(' [ -z "$_missing" ] && return 0') + runner_lines.append(' if command -v apt-get >/dev/null 2>&1 && sudo -n true 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] Auto-installing missing build deps via apt:$_missing"') + runner_lines.append(' sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update -qq 2>&1 | tail -3') + runner_lines.append(' sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends $_missing 2>&1 | tail -5 || true') + runner_lines.append(' elif command -v pacman >/dev/null 2>&1 && sudo -n true 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] Auto-installing missing build deps via pacman:$_missing"') + runner_lines.append(' local _pacpkgs="$(echo "$_missing" | sed -e \'s/build-essential/base-devel/g\')"') + runner_lines.append(' sudo -n pacman -Sy --needed --noconfirm $_pacpkgs 2>&1 | tail -5 || true') + runner_lines.append(' elif command -v dnf >/dev/null 2>&1 && sudo -n true 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] Auto-installing missing build deps via dnf:$_missing"') + runner_lines.append(' local _dnfpkgs="$(echo "$_missing" | sed -e \'s/build-essential/gcc gcc-c++ make/g\')"') + runner_lines.append(' sudo -n dnf install -y $_dnfpkgs 2>&1 | tail -5 || true') + runner_lines.append(' else') + runner_lines.append(' echo "[odysseus] WARNING: missing build deps ($_missing) — passwordless sudo is unavailable, cannot auto-install. 
Cookbook Diagnosis will explain the fix after the build fails."') + runner_lines.append(' fi') + runner_lines.append(' }') + runner_lines.append(' _odysseus_apt_bootstrap') + runner_lines.append(' _odysseus_missing_build_deps=""') + runner_lines.append(' command -v cmake >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps cmake"') + runner_lines.append(' command -v git >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps git"') + runner_lines.append(' command -v g++ >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps build-essential"') + runner_lines.append(' if [ -n "$_odysseus_missing_build_deps" ]; then') + runner_lines.append(' echo "ERROR: llama.cpp source build needs missing packages:$_odysseus_missing_build_deps"') + runner_lines.append(' if command -v apt-get >/dev/null 2>&1; then') + runner_lines.append(' echo "Install on this host: sudo apt-get update && sudo apt-get install -y cmake build-essential git"') + runner_lines.append(' elif command -v pacman >/dev/null 2>&1; then') + runner_lines.append(' echo "Install on this host: sudo pacman -Sy --needed cmake base-devel git"') + runner_lines.append(' elif command -v dnf >/dev/null 2>&1; then') + runner_lines.append(' echo "Install on this host: sudo dnf install -y cmake gcc gcc-c++ make git"') + runner_lines.append(' fi') + runner_lines.append(' echo "Alternative: install a native llama-server on PATH, then relaunch."') + runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') + runner_lines.append(' fi') + runner_lines.append(' cd ~/llama.cpp') + runner_lines.append(' _odysseus_has_vulkan() {') + runner_lines.append(' ldconfig -p 2>/dev/null | grep -q \'libvulkan\\.so\' && return 0') + runner_lines.append(' [ -e /usr/lib/libvulkan.so.1 ] && return 0') + runner_lines.append(' [ -e /usr/lib/x86_64-linux-gnu/libvulkan.so.1 ] && return 0') + runner_lines.append(' command -v vulkaninfo >/dev/null 2>&1 
&& return 0') + runner_lines.append(' return 1') + runner_lines.append(' }') + runner_lines.append(' _odysseus_has_vulkan_device() {') + runner_lines.append(' ls /dev/dri/renderD* >/dev/null 2>&1 && return 0') + runner_lines.append(' lspci 2>/dev/null | grep -Ei \'VGA|3D|Display\' | grep -Eiq \'AMD|ATI|Radeon\' && return 0') + runner_lines.append(' return 1') + runner_lines.append(' }') + # Backend preference: native ROCm/HIP > native CUDA > Vulkan > CPU. + # Vulkan is a portable fallback that works on AMD when ROCm isn't + # installed (e.g. Strix Halo) and on any vendor's discrete GPU, but + # it's ~30-40% slower than native HIP/CUDA for LLM inference — only + # pick it when no native toolchain is present. runner_lines.append(' if command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]; then') + runner_lines.append(' rm -rf build') runner_lines.append(' if command -v hipconfig &>/dev/null; then') runner_lines.append(' export HIPCXX="${HIPCXX:-$(hipconfig -l)/clang}"') runner_lines.append(' export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"') runner_lines.append(' fi') runner_lines.append(' echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."') runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') - runner_lines.append(' elif command -v nvcc &>/dev/null; then') + runner_lines.append(' elif command -v nvcc &>/dev/null && _odysseus_has_nvidia_hw; then') + runner_lines.append(' rm -rf build') # nvcc alone is not sufficient — pip-installed CUDA wheels or incomplete # tooling can expose nvcc without shipping libcudart, causing cmake to fail # mid-build with "CUDA runtime library not found". 
Check cudart explicitly @@ -828,31 +952,50 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None: runner_lines.append(' echo "[odysseus] Ensure libcudart is installed (e.g. cuda-runtime package) and visible via ldconfig or CUDA_HOME."') runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') runner_lines.append(' fi') + runner_lines.append(' elif _odysseus_has_vulkan_device && _odysseus_has_vulkan; then') + runner_lines.append(' echo "[odysseus] Vulkan-capable GPU detected (no ROCm/CUDA toolchain installed) — building llama-server with Vulkan support..."') + runner_lines.append(' rm -rf build-vulkan') + runner_lines.append(' cmake -B build-vulkan -DCMAKE_BUILD_TYPE=Release -DGGML_VULKAN=ON && cmake --build build-vulkan -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build-vulkan/bin/llama-server ~/bin/llama-server') runner_lines.append(' else') - runner_lines.append(' echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."') + runner_lines.append(' echo "[odysseus] WARNING: no HIP/CUDA/Vulkan toolchain found — building llama-server for CPU only."') runner_lines.append(' echo "[odysseus] GPU inference will not be available for this llama.cpp build."') - runner_lines.append(' echo "[odysseus] Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."') + runner_lines.append(' echo "[odysseus] Install Vulkan (libvulkan-dev) / ROCm for AMD GPUs or CUDA tooling for NVIDIA, then re-launch this serve task."') + runner_lines.append(' rm -rf build') runner_lines.append(' cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') runner_lines.append(' fi') + runner_lines.append(' fi # end _odysseus_have_prebuilt guard') -def 
_llama_cpp_rebuild_cmd() -> str: +def _llama_cpp_rebuild_cmd(update_source: bool = False) -> str: """Shell command that clears the Cookbook-managed llama.cpp build. - Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build`` + Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build*`` directory so the next llama.cpp serve recompiles from source, picking up a CUDA or HIP toolchain if one is now available. The serve bootstrap only builds when ``llama-server`` is missing from PATH, so without this an - existing CPU-only build is reused forever. It deliberately installs and - downloads nothing; the rebuild itself happens on the next serve. + existing CPU-only build is reused forever. When ``update_source`` is true, + the command also fast-forwards the Cookbook-managed ``~/llama.cpp`` checkout + if it exists. The rebuild itself happens on the next serve. """ + update_cmd = '' + if update_source: + update_cmd = ( + 'if [ -d "$HOME/llama.cpp/.git" ]; then ' + 'git -C "$HOME/llama.cpp" pull --ff-only --depth 1 || ' + 'echo "[odysseus] WARNING: llama.cpp source update failed; clearing cached build anyway."; ' + 'elif command -v git >/dev/null 2>&1; then ' + 'git clone --depth 1 https://github.com/ggml-org/llama.cpp "$HOME/llama.cpp" || ' + 'echo "[odysseus] WARNING: llama.cpp clone failed; clearing cached build anyway."; ' + 'fi && ' + ) return ( 'mkdir -p "$HOME/bin" && ' + f'{update_cmd}' 'rm -f "$HOME/bin/llama-server" && ' - 'rm -rf "$HOME/llama.cpp/build" && ' + 'rm -rf "$HOME/llama.cpp/build" "$HOME/llama.cpp/build-vulkan" && ' 'echo "[odysseus] Cleared the cached llama.cpp build. 
' 'Re-launch the serve task to rebuild llama-server from source ' - '(CUDA or HIP will be used if a toolchain is now available)."' + '(Vulkan, HIP, or CUDA will be used if a matching toolchain is now available)."' ) @@ -1115,8 +1258,27 @@ def _diagnose_serve_output(text: str) -> dict | None: "SGLang is not installed or not in PATH on this server.", [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}], ), + # System build deps come BEFORE the generic llama.cpp catch-all so + # cmake / build-essential / git missing → a specific OS-package + # remediation instead of "install llama-cpp-python[server]" (which + # itself fails to compile when cmake is absent). ( - r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found", + r"cmake: command not found|cmake.*not found.*[Cc]ould not", + "cmake is required to build llama.cpp from source but isn't installed on this server.", + [{"label": "install build deps for llama.cpp (apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git)", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"\b(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler", + "A C/C++ compiler (build-essential) is required to build llama.cpp from source.", + [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"\bgit: command not found", + "git is required to clone the llama.cpp source tree.", + [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'", "llama.cpp / llama-cpp-python dependencies are missing.", [{"label": "install llama.cpp 
dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}], ), diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index ea15a22c3..f57ecf6e5 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -189,8 +189,27 @@ def setup_cookbook_routes() -> APIRouter: "SGLang is not installed or not in PATH on this server.", [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}], ), + # System build deps come BEFORE the generic llama.cpp catch-all + # so cmake / build-essential / git missing → a specific OS-package + # remediation instead of "install llama-cpp-python[server]" (which + # itself fails to compile when cmake is absent). ( - r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found", + r"cmake: command not found|cmake.*not found.*[Cc]ould not", + "cmake is required to build llama.cpp from source but isn't installed on this server.", + [{"label": "install build deps for llama.cpp (apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git)", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"\b(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler", + "A C/C++ compiler (build-essential) is required to build llama.cpp from source.", + [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"\bgit: command not found", + "git is required to clone the llama.cpp source tree.", + [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'", "llama.cpp / llama-cpp-python dependencies are 
missing.", [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}], ), @@ -254,6 +273,79 @@ def setup_cookbook_routes() -> APIRouter: def _load_stored_hf_token() -> str: return load_stored_hf_token(state_path=_cookbook_state_path) + def _normalize_minimax_m3_vllm_cmd(cmd: str) -> str: + """Patch MiniMax M3 vLLM launches into the known-good local form. + + The browser form can be stale or omit advanced-only fields. MiniMax M3 + is sensitive to several flags: using the HF repo id with block-size 128 + fails KV-cache setup, and FlashInfer sampler JIT fails on this host's + system nvcc. Normalize server-side before writing the tmux runner. + """ + cmd_lower = (cmd or "").lower() + if not cmd or "vllm serve" not in cmd_lower or "minimax" not in cmd_lower or "m3" not in cmd_lower: + return cmd + try: + parts = shlex.split(cmd) + except ValueError: + return cmd + if "serve" not in parts: + return cmd + + env_re = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=") + env_parts = [p for p in parts if env_re.match(p)] + body = [p for p in parts if not env_re.match(p)] + try: + serve_i = body.index("serve") + except ValueError: + return cmd + if serve_i + 1 >= len(body): + return cmd + + repo_id = "cyankiwi/MiniMax-M3-AWQ-INT4" + snapshot = ( + Path.home() / ".cache" / "huggingface" / "hub" + / "models--cyankiwi--MiniMax-M3-AWQ-INT4" + / "snapshots" / "4082acbbec1236d21828d55b6bb0fe02ade4ab5b" + ) + if body[serve_i + 1] == repo_id and snapshot.is_dir(): + body[serve_i + 1] = str(snapshot) + + def add_env(key: str, value: str) -> None: + if not any(p.startswith(f"{key}=") for p in env_parts): + env_parts.append(f"{key}={value}") + + def has_flag(flag: str) -> bool: + return any(p == flag or p.startswith(flag + "=") for p in body) + + def set_flag(flag: str, value: str) -> None: + for i, part in enumerate(body): + if part == flag: + if i + 1 < len(body): + body[i + 1] = value + else: + body.append(value) + return + if part.startswith(flag + "="): + body[i] 
= f"{flag}={value}" + return + body.extend([flag, value]) + + def add_bool(flag: str) -> None: + if not has_flag(flag): + body.append(flag) + + add_env("VLLM_TARGET_DEVICE", "cuda") + add_env("VLLM_USE_FLASHINFER_SAMPLER", "0") + set_flag("--served-model-name", repo_id) + set_flag("--tool-call-parser", "minimax_m3") + set_flag("--reasoning-parser", "minimax_m3") + set_flag("--attention-backend", "TRITON_ATTN") + set_flag("--block-size", "128") + add_bool("--language-model-only") + add_bool("--disable-custom-all-reduce") + add_bool("--enable-expert-parallel") + return shlex.join(env_parts + body) + def _cookbook_ssh_dir() -> Path: # The Docker image keeps cookbook keys under /app/.ssh; that path only # exists inside the container. On Windows (and any non-container host) @@ -1230,6 +1322,7 @@ def setup_cookbook_routes() -> APIRouter: # `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400). req.cmd = _validate_serve_cmd(req.cmd) or "" req.cmd = _normalize_llama_cpp_python_cache_types(req.cmd) or "" + req.cmd = _normalize_minimax_m3_vllm_cmd(req.cmd) req.cmd = _venv_safe_local_pip_install_cmd( req.cmd, local=not bool(req.remote_host), @@ -1243,8 +1336,16 @@ def setup_cookbook_routes() -> APIRouter: req.cmd = _pip_install_no_cache(req.cmd) # Accept common aliases and enforce server extras for llama-cpp so # `python -m llama_cpp.server` has all runtime dependencies. - req.cmd = re.sub(r"(? APIRouter: runner_lines.append(' else') _append_llama_cpp_linux_accel_build_lines(runner_lines) runner_lines.append(' fi') + # Source the env file the prebuilt-download path writes so + # LD_LIBRARY_PATH includes the directory holding libllama.so + # and friends. No-op when prebuilt wasn't used. + runner_lines.append(' [ -r ~/.config/odysseus-llama-cpp-env ] && . ~/.config/odysseus-llama-cpp-env') + # Auto-upgrade pip llama-cpp-python to the CUDA-enabled + # wheel when (a) NVIDIA hardware is present and (b) the + # currently-installed wheel is CPU-only. 
Without this the + # user gets the Python server happily running at 3 tok/s + # because pip's default index ships CPU-only wheels. + # Forward-compat: cu124 wheels work on driver/runtime + # 12.4+ including the cu13.x line. + runner_lines.append(' if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU " && python3 -c "import llama_cpp" 2>/dev/null; then') + runner_lines.append(' if ! python3 -c "import llama_cpp; import sys; sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] NVIDIA detected but installed llama-cpp-python is CPU-only — reinstalling with CUDA wheel index for GPU offload..."') + runner_lines.append(' python3 -m pip install --user --break-system-packages --force-reinstall --no-cache-dir "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124 2>&1 | tail -8 || echo "[odysseus] WARNING: CUDA wheel reinstall failed — Python server will stay CPU-only (slow). Manual fix: pip install --user --force-reinstall \'llama-cpp-python[server]\' --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124"') + runner_lines.append(' if python3 -c "import llama_cpp; import sys; sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" 2>/dev/null; then') + runner_lines.append(' echo "[odysseus] llama-cpp-python now supports GPU offload."') + runner_lines.append(' fi') + runner_lines.append(' fi') + runner_lines.append(' fi') + # SHORT-CIRCUIT before the build/pip fallback: if the + # native binary is missing but llama_cpp Python is already + # installed, drop a wrapper at ~/bin/llama-server that + # translates llama-server CLI args to llama_cpp.server's + # underscore-style flags. The user's serve command stays + # `llama-server ...` and "just works" — no build, no cmake, + # no second install. 
This is the path that unblocks every + # remote where pip-installed llama-cpp-python is already + # working but Cookbook used to insist on a native binary. + runner_lines.append(' if ! command -v llama-server >/dev/null 2>&1 && python3 -c "import llama_cpp" 2>/dev/null; then') + runner_lines.append(' mkdir -p ~/bin') + runner_lines.append(' cat > ~/bin/llama-server <<\'_ODY_LLAMA_SHIM_EOF\'') + runner_lines.append('#!/usr/bin/env bash') + runner_lines.append('# Auto-generated by Odysseus Cookbook: a `llama-server` lookalike') + runner_lines.append('# that translates the native CLI to `python -m llama_cpp.server`.') + runner_lines.append('# Lets cookbook-generated launch commands run unchanged on hosts') + runner_lines.append('# where only the pip llama-cpp-python package is installed.') + runner_lines.append('ARGS=()') + runner_lines.append('while [ $# -gt 0 ]; do') + runner_lines.append(' case "$1" in') + runner_lines.append(' -ngl|--gpu-layers|--n-gpu-layers) ARGS+=(--n_gpu_layers "$2"); shift 2 ;;') + runner_lines.append(' -c|--ctx-size) ARGS+=(--n_ctx "$2"); shift 2 ;;') + runner_lines.append(' -b|--batch-size) ARGS+=(--n_batch "$2"); shift 2 ;;') + runner_lines.append(' -ub|--ubatch-size) shift 2 ;; # llama-cpp-python has no separate ubatch') + runner_lines.append(' --flash-attn) ARGS+=(--flash_attn true); shift 2 ;;') + runner_lines.append(' --cache-type-k) ARGS+=(--type_k "$2"); shift 2 ;;') + runner_lines.append(' --cache-type-v) ARGS+=(--type_v "$2"); shift 2 ;;') + runner_lines.append(' --n-cpu-moe) ARGS+=(--n_cpu_moe "$2"); shift 2 ;;') + runner_lines.append(' --mmproj) ARGS+=(--clip_model_path "$2"); shift 2 ;;') + runner_lines.append(' --image-max-tokens) shift 2 ;; # native-only') + runner_lines.append(' --no-mmap) ARGS+=(--no_mmap true); shift ;;') + runner_lines.append(' --no-warmup) shift ;; # native-only') + runner_lines.append(' --chat-template) ARGS+=(--chat_format "$2"); shift 2 ;;') + runner_lines.append(' 
--fit|--split-mode|--tensor-split|--main-gpu|--parallel) shift 2 ;; # native-only') + runner_lines.append(' --mlock) ARGS+=(--use_mlock true); shift ;;') + runner_lines.append(' *) ARGS+=("$1"); shift ;;') + runner_lines.append(' esac') + runner_lines.append('done') + runner_lines.append('exec python3 -m llama_cpp.server "${ARGS[@]}"') + runner_lines.append('_ODY_LLAMA_SHIM_EOF') + runner_lines.append(' chmod +x ~/bin/llama-server') + runner_lines.append(' echo "[odysseus] Created llama-server shim → python -m llama_cpp.server (no native binary needed)"') + runner_lines.append(' fi') runner_lines.append(' # If the native build failed, fall back to the Python bindings.') runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then') runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."') @@ -1494,6 +1658,96 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(' echo "ERROR: vLLM is not installed."') runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') runner_lines.append('fi') + runner_lines.append(f"ODYSSEUS_SERVE_CMD='{_bash_squote(req.cmd)}'") + runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ]; then') + runner_lines.append(' ODYSSEUS_VLLM_HELP_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'') + runner_lines.append('import shlex, sys') + runner_lines.append('parts = shlex.split(sys.argv[1])') + runner_lines.append('try:') + runner_lines.append(' serve_i = parts.index("serve")') + runner_lines.append('except ValueError:') + runner_lines.append(' print("vllm serve --help")') + runner_lines.append('else:') + runner_lines.append(' print(shlex.join(parts[:serve_i + 1] + ["--help"]))') + runner_lines.append('PY') + runner_lines.append(')"') + runner_lines.append(' ODYSSEUS_VLLM_SUPPORTS_SWAP=0') + runner_lines.append(' if eval "$ODYSSEUS_VLLM_HELP_CMD" 2>&1 | grep -q -- "--swap-space"; then ODYSSEUS_VLLM_SUPPORTS_SWAP=1; fi') + 
runner_lines.append('fi') + runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ] && [ "${ODYSSEUS_VLLM_SUPPORTS_SWAP:-0}" = "1" ] && ! printf "%s" "$ODYSSEUS_SERVE_CMD" | grep -q -- "--swap-space"; then') + runner_lines.append(' echo "[odysseus] Setting vLLM --swap-space 0 so the runtime does not reserve CPU swap per GPU."') + runner_lines.append(' ODYSSEUS_SERVE_CMD="${ODYSSEUS_SERVE_CMD} --swap-space 0"') + runner_lines.append('fi') + runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ] && [ "${ODYSSEUS_VLLM_SUPPORTS_SWAP:-0}" != "1" ]; then') + runner_lines.append(' if printf "%s" "$ODYSSEUS_SERVE_CMD" | grep -q -- "--swap-space"; then') + runner_lines.append(' echo "[odysseus] vLLM serve does not expose --swap-space; removing the flag and patching the runtime default to 0."') + runner_lines.append(' ODYSSEUS_SERVE_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'') + runner_lines.append('import shlex, sys') + runner_lines.append('parts = shlex.split(sys.argv[1])') + runner_lines.append('out = []') + runner_lines.append('skip = False') + runner_lines.append('for part in parts:') + runner_lines.append(' if skip:') + runner_lines.append(' skip = False') + runner_lines.append(' continue') + runner_lines.append(' if part == "--swap-space":') + runner_lines.append(' skip = True') + runner_lines.append(' continue') + runner_lines.append(' if part.startswith("--swap-space="):') + runner_lines.append(' continue') + runner_lines.append(' out.append(part)') + runner_lines.append('print(shlex.join(out))') + runner_lines.append('PY') + runner_lines.append(')"') + runner_lines.append(' fi') + runner_lines.append(' ODYSSEUS_SERVE_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'') + runner_lines.append('import shlex, sys') + runner_lines.append('parts = shlex.split(sys.argv[1])') + runner_lines.append('patch = r"""import inspect, sys') + runner_lines.append('from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs') + runner_lines.append('def 
_odysseus_swap0(cls):') + runner_lines.append(' params = list(inspect.signature(cls).parameters)') + runner_lines.append(' if "swap_space" not in params:') + runner_lines.append(' return') + runner_lines.append(' idx = params.index("swap_space")') + runner_lines.append(' defaults = list(cls.__init__.__defaults__ or ())') + runner_lines.append(' if idx < len(defaults):') + runner_lines.append(' defaults[idx] = 0') + runner_lines.append(' cls.__init__.__defaults__ = tuple(defaults)') + runner_lines.append(' fields = getattr(cls, "__dataclass_fields__", {})') + runner_lines.append(' if "swap_space" in fields:') + runner_lines.append(' fields["swap_space"].default = 0') + runner_lines.append('_odysseus_swap0(EngineArgs)') + runner_lines.append('_odysseus_swap0(AsyncEngineArgs)') + runner_lines.append('try:') + runner_lines.append(' from vllm.config import CacheConfig') + runner_lines.append(' CacheConfig.swap_space = 0') + runner_lines.append('except Exception:') + runner_lines.append(' pass') + runner_lines.append('_orig_create_engine_config = EngineArgs.create_engine_config') + runner_lines.append('def _odysseus_create_engine_config(self, *args, **kwargs):') + runner_lines.append(' self.swap_space = 0') + runner_lines.append(' return _orig_create_engine_config(self, *args, **kwargs)') + runner_lines.append('EngineArgs.create_engine_config = _odysseus_create_engine_config') + runner_lines.append('AsyncEngineArgs.create_engine_config = _odysseus_create_engine_config') + runner_lines.append('from vllm.entrypoints.cli.main import main') + runner_lines.append('sys.exit(main())"""') + runner_lines.append('try:') + runner_lines.append(' serve_i = parts.index("serve")') + runner_lines.append('except ValueError:') + runner_lines.append(' print(shlex.join(parts))') + runner_lines.append('else:') + runner_lines.append(' exe_i = serve_i - 1') + runner_lines.append(' exe = parts[exe_i] if exe_i >= 0 else "vllm"') + runner_lines.append(' py = "python3"') + runner_lines.append(' if 
exe.endswith("/bin/vllm"):') + runner_lines.append(' py = exe[:-len("/bin/vllm")] + "/bin/python"') + runner_lines.append(' parts[exe_i:serve_i] = [py, "-c", patch]') + runner_lines.append(' print(shlex.join(parts))') + runner_lines.append('PY') + runner_lines.append(')"') + runner_lines.append(' echo "[odysseus] Patched vLLM internal swap_space default to 0 for this runtime."') + runner_lines.append('fi') elif "sglang.launch_server" in req.cmd: runner_lines.append('export PATH="$HOME/.local/bin:$PATH"') runner_lines.append('if ! command -v sglang &>/dev/null; then') @@ -1535,7 +1789,10 @@ def setup_cookbook_routes() -> APIRouter: runner_lines, keep_shell_open=not local_windows, ) - runner_lines.append(req.cmd) + if "vllm serve" in req.cmd: + runner_lines.append('eval "$ODYSSEUS_SERVE_CMD"') + else: + runner_lines.append(req.cmd) if local_windows: # Detached background process — no interactive shell to keep open. # Print the exit marker the status poller looks for, then stop. @@ -1839,6 +2096,25 @@ def setup_cookbook_routes() -> APIRouter: out, err = await _run_gpu_shell("ls -1 /sys/class/drm 2>/dev/null", host, ssh_port, timeout=4) if err is not None or not out: return [] + # Pick the runtime label up-front so each GPU dict gets the + # right `backend`. AMD silicon can be driven by ROCm/HIP (native) + # OR Vulkan (mesa RADV). Reporting "rocm" on a host where no + # ROCm toolchain is installed misleads the frontend env-var + # prefix logic — it would emit `HIP_VISIBLE_DEVICES=` for a + # Vulkan-only stack, which is a silent no-op at best. 
+ rt_out, _ = await _run_gpu_shell( + 'command -v rocminfo >/dev/null 2>&1 && echo rocm ' + '|| (command -v hipconfig >/dev/null 2>&1 && echo rocm) ' + '|| (command -v vulkaninfo >/dev/null 2>&1 && echo vulkan) ' + '|| echo unknown', + host, ssh_port, timeout=4, + ) + _amd_runtime = (rt_out or "").strip().splitlines()[-1:][0].strip() if rt_out else "rocm" + if _amd_runtime not in ("rocm", "vulkan"): + # Default to rocm so existing ROCm-installed hosts keep + # working; "unknown" only happens when neither toolchain is + # detected (e.g. minimal sysfs read on a fresh box). + _amd_runtime = "rocm" gpus = [] for entry in out.split(): if not entry.startswith("card") or "-" in entry: @@ -1882,7 +2158,7 @@ def setup_cookbook_routes() -> APIRouter: "free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb, "gtt_used_mb": gtt_used_mb, "util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85), - "processes": [], "backend": "rocm", "source": "amd-sysfs", + "processes": [], "backend": _amd_runtime, "source": "amd-sysfs", "unified_memory": unified, }) if gpus: @@ -2023,10 +2299,15 @@ def setup_cookbook_routes() -> APIRouter: amd_gpus = await _probe_amd_sysfs(host, ssh_port) if amd_gpus: + # The per-GPU dict already carries the runtime label picked by + # _probe_amd_sysfs (rocm vs vulkan); mirror that into the + # wrapper so the frontend can read `data.backend` directly + # without scanning the list. 
+ _amd_wrap_backend = str(amd_gpus[0].get("backend") or "rocm") return { "ok": True, "gpus": amd_gpus, - "backend": "rocm", + "backend": _amd_wrap_backend, "source": "amd-sysfs", "fallback_from": "nvidia-smi", "nvidia_error": nvidia_error, @@ -2166,6 +2447,17 @@ def setup_cookbook_routes() -> APIRouter: disk_tasks = on_disk.get("tasks") or [] if isinstance(on_disk, dict) else [] incoming_tasks = data.get("tasks") if isinstance(data.get("tasks"), list) else [] + incoming_removed = data.get("removedTasks") if isinstance(data.get("removedTasks"), dict) else {} + disk_removed = on_disk.get("removedTasks") if isinstance(on_disk, dict) and isinstance(on_disk.get("removedTasks"), dict) else {} + removed_tasks = {**disk_removed, **incoming_removed} + data["removedTasks"] = removed_tasks + removed_ids = set(removed_tasks.keys()) + if removed_ids: + incoming_tasks = [ + t for t in incoming_tasks + if not (isinstance(t, dict) and t.get("sessionId") in removed_ids) + ] + data["tasks"] = incoming_tasks # Anti-poisoning guard: a stale browser tab can keep POSTing a # download task as status='done' from before the strict-finish # fix landed, undoing any server-side correction. For each @@ -2203,6 +2495,8 @@ def setup_cookbook_routes() -> APIRouter: sid = t.get("sessionId") if not sid or sid in incoming_ids: continue # client's version wins + if sid in removed_ids: + continue # intentional cross-device clear/remove ts = t.get("ts") or 0 if isinstance(ts, (int, float)) and (now_ms - ts) <= RACE_WINDOW_MS: preserved.append(t) @@ -2309,16 +2603,14 @@ def setup_cookbook_routes() -> APIRouter: # Add 30% headroom for KV cache, activations, etc. needed_vram = (est_vram * 1.3) if est_vram else None - if vram_gb > 0 and needed_vram is not None and needed_vram > vram_gb: - continue - # Unknown-size models (e.g. MiniMax-M2.7, DeepSeek-V4-Flash) have no - # "NB" in the repo id, so the regex above can't extract their - # param count. 
Previously we dropped them entirely, which made - # brand-new flagship releases silently vanish from this list even - # on rigs with hundreds of GB of VRAM. Adapters/LoRAs are already - # filtered by _is_excluded(), so what falls through here is - # overwhelmingly full models — keep them, just without a size - # badge (the frontend handles needed_vram_gb=null gracefully). + if vram_gb > 0: + if needed_vram is None: + # The "trending models that fit" list must be conservative: + # if we cannot estimate size from the repo id/tags, do not + # present it as runnable on this hardware. + continue + if needed_vram > vram_gb: + continue out.append({ "repo_id": repo_id, @@ -2515,6 +2807,33 @@ def setup_cookbook_routes() -> APIRouter: except Exception as e: logger.warning(f"orphan sweep: state write failed: {e}") + @router.get("/api/cookbook/hf-gguf-files") + async def hf_gguf_files(repo_id: str, owner: str = Depends(require_user)): + """List GGUF files in a HuggingFace repo for the direct-download picker.""" + import httpx + + repo_id = _validate_repo_id(repo_id) + url = f"https://huggingface.co/api/models/{repo_id}" + try: + headers = {} + token = _load_stored_hf_token() + if token: + headers["Authorization"] = f"Bearer {token}" + async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client: + resp = await client.get(url, headers=headers) + if resp.status_code != 200: + return {"ok": False, "files": [], "error": f"HF API HTTP {resp.status_code}"} + data = resp.json() + except Exception: + logger.exception("HF GGUF file scan failed for %s", repo) + return {"ok": False, "files": [], "error": "HF API request failed"} + files = [ + str(s.get("rfilename") or "") + for s in data.get("siblings", []) + if str(s.get("rfilename") or "").lower().endswith(".gguf") + ] + return {"ok": True, "repo_id": repo_id, "files": files} + # In-memory cache for the Ollama library scrape. 
ollama.com is a public # site, but it doesn't expose a stable JSON listing — we fetch the HTML # search page and regex out the model cards. Cached for 1 h so a busy diff --git a/routes/email_helpers.py b/routes/email_helpers.py index e33b72182..513ec1f0a 100644 --- a/routes/email_helpers.py +++ b/routes/email_helpers.py @@ -1233,22 +1233,30 @@ def _list_attachments_from_msg(msg): return attachments idx = 0 for part in msg.walk(): - if part.is_multipart(): - continue cd = str(part.get("Content-Disposition", "")) ct = part.get_content_type() + is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename()) + if part.is_multipart() and not is_attached_email: + continue # Skip text/html body parts (only consider real attachments) if ct in ("text/plain", "text/html") and "attachment" not in cd: continue filename = part.get_filename() if filename: filename = _decode_header(filename) + if ct == "message/rfc822" and not re.search(r"\.[A-Za-z0-9]{1,8}$", filename): + filename = f"{filename}.eml" else: # Inline images, etc. 
- generate a name - ext = ct.split("/")[-1] if "/" in ct else "bin" + ext = "eml" if ct == "message/rfc822" else (ct.split("/")[-1] if "/" in ct else "bin") filename = f"attachment_{idx}.{ext}" payload = part.get_payload(decode=True) - size = len(payload) if payload else 0 + if payload is None and ct == "message/rfc822": + try: + payload = part.as_bytes() + except Exception: + payload = b"" + size = len(payload) if payload is not None else 0 attachments.append({ "index": idx, "filename": filename, @@ -1260,29 +1268,58 @@ def _list_attachments_from_msg(msg): return attachments +def _is_likely_signature_image_attachment(att: dict) -> bool: + """Match the reader's inline signature/logo image filter.""" + filename = str((att or {}).get("filename") or "").lower() + if not re.search(r"\.(png|jpe?g|gif|bmp|svg|webp)$", filename): + return False + size = int((att or {}).get("size") or 0) + if re.search(r"^image\d{3,}\.(png|jpe?g|gif)$", filename): + return True + if re.search(r"^(signature|logo|sig|footer|banner)[-_\d]*\.(png|jpe?g|gif|svg)$", filename): + return True + return 0 < size < 30 * 1024 + + +def _has_visible_attachments(msg) -> bool: + """Return True only for attachments the reader will render as chips.""" + return any( + not _is_likely_signature_image_attachment(att) + for att in _list_attachments_from_msg(msg) + ) + + def _extract_attachment_to_disk(msg, index, target_dir): """Extract a specific attachment to disk and return the file path.""" if not msg.is_multipart(): return None idx = 0 for part in msg.walk(): - if part.is_multipart(): - continue cd = str(part.get("Content-Disposition", "")) ct = part.get_content_type() + is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename()) + if part.is_multipart() and not is_attached_email: + continue if ct in ("text/plain", "text/html") and "attachment" not in cd: continue if idx == index: filename = part.get_filename() if filename: filename = _decode_header(filename) + if ct 
== "message/rfc822" and not re.search(r"\.[A-Za-z0-9]{1,8}$", filename): + filename = f"{filename}.eml" else: - ext = ct.split("/")[-1] if "/" in ct else "bin" + ext = "eml" if ct == "message/rfc822" else (ct.split("/")[-1] if "/" in ct else "bin") filename = f"attachment_{idx}.{ext}" # Sanitize safe_name = re.sub(r"[^\w\s\-.]", "_", filename).strip() payload = part.get_payload(decode=True) - if not payload: + if payload is None and ct == "message/rfc822": + try: + payload = part.as_bytes() + except Exception: + payload = b"" + if payload is None: return None target_dir.mkdir(parents=True, exist_ok=True) filepath = target_dir / safe_name diff --git a/routes/email_routes.py b/routes/email_routes.py index b9da5a82e..77be0cdeb 100644 --- a/routes/email_routes.py +++ b/routes/email_routes.py @@ -47,7 +47,7 @@ from routes.email_helpers import ( _IMAP_TIMEOUT_SECONDS, _open_imap_connection, make_oauth_state, verify_oauth_state, _imap_connect, _imap, _decode_header, _detect_sent_folder, _detect_drafts_folder, - _extract_attachment_text, _list_attachments_from_msg, + _extract_attachment_text, _list_attachments_from_msg, _has_visible_attachments, _is_likely_signature_image_attachment, _extract_attachment_to_disk, _extract_html, _extract_text, _fetch_sender_thread_context, _pre_retrieve_context, _EMAIL_REPLY_SYS_PROMPT_BASE, _POOL_HOOKS, @@ -61,6 +61,7 @@ from routes.email_pollers import _start_poller logger = logging.getLogger(__name__) ODYSSEUS_MAIL_ORIGIN = "odysseus-ui" +EMAIL_READ_ATTACHMENT_VERSION = 2 def _email_tag_owner_aliases(account_id: str | None, owner: str = "") -> list[str]: @@ -248,6 +249,21 @@ def _imap_uid_fetch(conn, uid_set: str | bytes, query: str): return conn.uid("FETCH", _uid_bytes(uid_set), query) +def _imap_search_quote(value: str) -> str: + return '"' + str(value or "").replace("\\", "\\\\").replace('"', '\\"') + '"' + + +def _message_id_chain(*values: str) -> list[str]: + seen = set() + out = [] + for value in values: + for mid in 
re.findall(r"<[^>]+>", value or ""): + if mid not in seen: + seen.add(mid) + out.append(mid) + return out + + def _uid_from_fetch_meta(meta_b: bytes) -> str: m = re.search(rb"\bUID\s+(\d+)\b", meta_b) return m.group(1).decode() if m else "" @@ -366,6 +382,21 @@ def _apply_odysseus_headers(msg, kind: str | None = None, ref_id: str | None = N msg["X-Odysseus-Ref"] = re.sub(r"[^A-Za-z0-9_.:-]", "-", ref_id)[:128] +def _normalize_addr_field(field: str) -> str: + """Strip the malformed-but-common trailing/leading commas and stray + whitespace from a To/Cc/Bcc string before it lands in the MIME header + or the SMTP envelope. Users often paste a single address with a + trailing comma (e.g. `felix@pewdiepie.com,`) and most MTAs reject the + resulting `To: felix@pewdiepie.com,` line as a syntax error. Collapse + any run of separator junk between addresses too.""" + if not field: + return field + # Split on commas, drop empty tokens, rejoin with a single ', '. + parts = [p.strip() for p in field.split(",")] + parts = [p for p in parts if p] + return ", ".join(parts) + + def _envelope_recipients(*fields: str) -> list: """Extract bare SMTP envelope addresses from one or more To/Cc/Bcc header strings. 
A naive `field.split(",")` corrupts display names that contain a @@ -994,6 +1025,65 @@ def setup_email_routes(): except Exception: pass + def _related_thread_attachments_sync( + folder: str, + account_id: str | None, + owner: str, + current_uid: str, + current_message_id: str, + in_reply_to: str, + references: str, + limit: int = 12, + ) -> list[dict]: + """Return visible attachments from referenced messages in this folder.""" + wanted_ids = _message_id_chain(references, in_reply_to) + current_mid = (current_message_id or "").strip() + wanted_ids = [mid for mid in wanted_ids if mid and mid != current_mid] + if not wanted_ids: + return [] + + related: list[dict] = [] + try: + with _imap(account_id, owner=owner) as conn: + conn.select(_q(folder), readonly=True) + # Search newest referenced messages first; cap work so opening + # a long thread stays bounded. + for mid in reversed(wanted_ids[-10:]): + if len(related) >= limit: + break + status, data = _imap_uid_search(conn, f'(HEADER Message-ID {_imap_search_quote(mid)})') + if status != "OK" or not data or not data[0]: + continue + for uid_b in reversed(data[0].split()[-3:]): + source_uid = uid_b.decode(errors="ignore") + if not source_uid or source_uid == str(current_uid): + continue + st2, msg_data = _imap_uid_fetch(conn, source_uid, "(BODY.PEEK[])") + if st2 != "OK" or not msg_data or not isinstance(msg_data[0], tuple): + continue + msg = email_mod.message_from_bytes(msg_data[0][1]) + source_from = _decode_header(msg.get("From", "")) + source_subject = _decode_header(msg.get("Subject", "")) + source_date = msg.get("Date", "") + for att in _list_attachments_from_msg(msg): + if _is_likely_signature_image_attachment(att): + continue + enriched = dict(att) + enriched.update({ + "source_uid": source_uid, + "source_folder": folder, + "source_message_id": (msg.get("Message-ID") or "").strip(), + "source_from": source_from, + "source_subject": source_subject, + "source_date": source_date, + }) + related.append(enriched) + 
if len(related) >= limit: + break + except Exception as e: + logger.debug(f"related thread attachment lookup failed uid={current_uid}: {e}") + return related + @router.get("/list") async def list_emails( folder: str = Query("INBOX"), @@ -1264,6 +1354,17 @@ def setup_email_routes(): sender_name, sender_addr = email.utils.parseaddr(sender) parsed_date = email.utils.parsedate_to_datetime(date_str) if date_str else None attachments = _list_attachments_from_msg(msg) + related_attachments = [] + if not _has_visible_attachments(msg): + related_attachments = _related_thread_attachments_sync( + folder, + account_id, + owner, + uid, + message_id, + in_reply_to, + references, + ) if mark_seen: # Set \Seen in a separate readwrite session so concurrent reads @@ -1372,6 +1473,8 @@ def setup_email_routes(): "body": body, "body_html": body_html, "attachments": attachments, + "related_attachments": related_attachments, + "attachment_version": EMAIL_READ_ATTACHMENT_VERSION, "cached_summary": cached_summary, "cached_ai_reply": cached_ai_reply, "boundaries": cached_boundaries, @@ -1402,6 +1505,12 @@ def setup_email_routes(): """Read email body. Cached for 30m, sync IMAP work runs in a thread.""" ck = _read_cache_key(account_id, folder, uid, owner=owner) cached = _read_cache_get(ck) + if cached is not None: + # Older cached read responses lack the thread-attachment fallback. + # Fetch once so replies that reference prior attachments can show + # those files without waiting for cache expiry. 
+ if cached.get("attachment_version") != EMAIL_READ_ATTACHMENT_VERSION: + cached = None if cached is not None: if mark_seen: try: @@ -1536,6 +1645,12 @@ def setup_email_routes(): return {"error": f"Attachment index {index} not found"} from pathlib import Path as _Path + target_root = os.path.abspath(str(target_dir)) + filepath_str = os.path.abspath(str(filepath)) + if os.path.commonpath([target_root, filepath_str]) != target_root: + logger.warning("Rejected attachment path outside extraction dir: %s", filepath) + return {"error": "Invalid attachment path"} + filepath = _Path(filepath_str) base = _Path(filepath).name if base.startswith("."): return {"error": "Invalid filename", "filename": base} @@ -1590,6 +1705,65 @@ def setup_email_routes(): return None doc_session_id = _resolve_doc_session() + def _create_markdown_doc(content: str, summary: str): + from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV + doc_id = str(uuid.uuid4()) + ver_id = str(uuid.uuid4()) + _db = _SL() + try: + _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False}) + _db.add(_Doc( + id=doc_id, session_id=doc_session_id, title=title, + language="markdown", current_content=content, + version_count=1, is_active=True, + )) + _db.add(_DV( + id=ver_id, document_id=doc_id, version_number=1, + content=content, summary=summary, source="upload", + )) + _db.commit() + finally: + _db.close() + _tag_doc_with_source(doc_id) + return doc_id + + def _attached_email_markdown(raw_bytes: bytes): + if not raw_bytes: + return f"# Attached email: {base}\n\n_(empty email attachment)_" + try: + attached_msg = email_mod.message_from_bytes(raw_bytes) + except Exception: + logger.exception("Failed to parse attached email %s", base) + return f"# Attached email: {base}\n\nCould not parse this email attachment." 
+ + attached_subject = _decode_header(attached_msg.get("Subject", "")) or base + attached_from = _decode_header(attached_msg.get("From", "")) + attached_to = _decode_header(attached_msg.get("To", "")) + attached_cc = _decode_header(attached_msg.get("Cc", "")) + attached_date = attached_msg.get("Date", "") + attached_body = _extract_text(attached_msg).strip() + attached_atts = _list_attachments_from_msg(attached_msg) + + lines = [f"# Attached email: {attached_subject}", ""] + if attached_from: + lines.append(f"**From:** {attached_from}") + if attached_to: + lines.append(f"**To:** {attached_to}") + if attached_cc: + lines.append(f"**Cc:** {attached_cc}") + if attached_date: + lines.append(f"**Date:** {attached_date}") + lines.extend(["", "## Body", "", attached_body or "_(no readable body)_"]) + if attached_atts: + lines.extend(["", "## Attachments", ""]) + for att in attached_atts: + size = int(att.get("size") or 0) + size_label = f"{size} B" if size < 1024 else f"{round(size / 1024)} KB" + name = att.get("filename") or f"attachment_{att.get('index', '')}" + ctype = att.get("content_type") or "application/octet-stream" + lines.append(f"- {name} ({ctype}, {size_label})") + return "\n".join(lines).strip() + # ── PDF path (existing) ──────────────────────────────────── if ext == ".pdf": import shutil as _shutil @@ -1636,6 +1810,39 @@ def setup_email_routes(): _tag_doc_with_source(doc_id) return {"doc_id": doc_id, "filename": filepath.name} + # ── Attached email (.eml / message/rfc822) ──────────────── + if ext == ".eml": + def _attachment_bytes_from_msg(): + if not msg.is_multipart(): + return b"" + idx = 0 + for part in msg.walk(): + cd = str(part.get("Content-Disposition", "")) + ct = part.get_content_type() + is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename()) + if part.is_multipart() and not is_attached_email: + continue + if ct in ("text/plain", "text/html") and "attachment" not in cd: + continue + if idx == index: + 
payload = part.get_payload(decode=True) + if payload is None and ct == "message/rfc822": + try: + payload = part.as_bytes() + except Exception: + payload = b"" + return payload or b"" + idx += 1 + return b"" + + try: + content = _attached_email_markdown(_attachment_bytes_from_msg()) + except Exception: + logger.exception("Failed to read email attachment %s", base) + return {"error": "Failed to read email attachment", "filename": base} + doc_id = _create_markdown_doc(content, "Imported attached email") + return {"doc_id": doc_id, "filename": filepath.name} + # ── DOCX path: extract text → markdown document ─────────── if ext == ".docx": try: @@ -1673,25 +1880,7 @@ def setup_email_routes(): lines.append("") content = "\n".join(lines).strip() or f"_(empty {base})_" - from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV - doc_id = str(uuid.uuid4()) - ver_id = str(uuid.uuid4()) - _db = _SL() - try: - _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False}) - _db.add(_Doc( - id=doc_id, session_id=doc_session_id, title=title, - language="markdown", current_content=content, - version_count=1, is_active=True, - )) - _db.add(_DV( - id=ver_id, document_id=doc_id, version_number=1, - content=content, summary="Imported from DOCX", source="upload", - )) - _db.commit() - finally: - _db.close() - _tag_doc_with_source(doc_id) + doc_id = _create_markdown_doc(content, "Imported from DOCX") return {"doc_id": doc_id, "filename": filepath.name} # ── Plain text / markdown ──────────────────────────────── @@ -1700,25 +1889,7 @@ def setup_email_routes(): content = filepath.read_text(encoding="utf-8", errors="replace") except Exception as e: return {"error": f"Failed to read text file: {e}", "filename": base} - from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV - doc_id = str(uuid.uuid4()) - ver_id = str(uuid.uuid4()) - _db = _SL() - try: - _db.query(_Doc).filter(_Doc.is_active == 
True).update({"is_active": False}) - _db.add(_Doc( - id=doc_id, session_id=doc_session_id, title=title, - language="markdown", current_content=content, - version_count=1, is_active=True, - )) - _db.add(_DV( - id=ver_id, document_id=doc_id, version_number=1, - content=content, summary="Imported from email attachment", source="upload", - )) - _db.commit() - finally: - _db.close() - _tag_doc_with_source(doc_id) + doc_id = _create_markdown_doc(content, "Imported from email attachment") return {"doc_id": doc_id, "filename": filepath.name} return {"error": f"Unsupported attachment type: {ext}", "filename": base} @@ -2027,6 +2198,9 @@ def setup_email_routes(): outer = MIMEMultipart("alternative") body_container = outer + to = _normalize_addr_field(to or "") + cc = _normalize_addr_field(cc or "") + bcc = _normalize_addr_field(bcc or "") outer["From"] = email.utils.formataddr((cfg.get("display_name") or "", cfg["from_address"])) outer["To"] = to if cc: @@ -2302,6 +2476,9 @@ def setup_email_routes(): outer = MIMEMultipart("alternative") body_container = outer + req.to = _normalize_addr_field(req.to or "") + req.cc = _normalize_addr_field(req.cc or "") + req.bcc = _normalize_addr_field(req.bcc or "") outer["From"] = email.utils.formataddr((cfg.get("display_name") or "", cfg["from_address"])) outer["To"] = req.to if req.cc: diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py index 5e38b9ca3..0a1f00a60 100644 --- a/routes/hwfit_routes.py +++ b/routes/hwfit_routes.py @@ -1,8 +1,13 @@ +import json +import os import re +import shlex +import subprocess from copy import deepcopy from fastapi import APIRouter, HTTPException +from core.platform_compat import run_ssh_command from routes._validators import validate_remote_host, validate_ssh_port @@ -107,6 +112,73 @@ def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_v return system +def _run_model_probe(host: str, ssh_port: str, cmd: str) -> str: + try: + if host: + r = run_ssh_command( + host, + 
ssh_port or None, + cmd, + timeout=15, + connect_timeout=5, + strict_host_key_checking=False, + text=True, + ) + else: + r = subprocess.run(["bash", "-lc", cmd], capture_output=True, text=True, timeout=15) + if r.returncode == 0: + return (r.stdout or "").strip() + except Exception: + return "" + return "" + + +def _inspect_model_path(model_path: str, host: str = "", ssh_port: str = "") -> dict: + """Read lightweight metadata from a local or SSH-visible HF model folder.""" + path = (model_path or "").strip() + if not path or path.startswith(("http://", "https://")): + return {} + if not (path.startswith("/") or path.startswith("~")): + return {} + + qpath = shlex.quote(path) + qconfig = shlex.quote(os.path.join(path, "config.json")) + out = {} + exists = _run_model_probe(host, ssh_port, f"test -d {qpath} && printf found || printf missing") + if exists != "found": + target = host or "local container" + out["model_probe_error"] = f"Model path is not visible on {target}: {path}" + return out + raw_config = _run_model_probe(host, ssh_port, f"test -f {qconfig} && sed -n '1,240p' {qconfig}") + if raw_config: + try: + cfg = json.loads(raw_config) + except Exception: + cfg = {} + for key in ("context_length", "max_position_embeddings", "n_ctx_train", "model_max_length", "max_seq_len"): + value = cfg.get(key) + if isinstance(value, (int, float)) and value > 0: + out["model_ctx_max"] = int(value) + break + else: + out["model_probe_error"] = f"config.json not found in model path: {path}" + + size_cmd = ( + f"find {qpath} -type f \\( -name '*.safetensors' -o -name '*.bin' -o -name '*.gguf' \\) " + "-printf '%s\\n' 2>/dev/null | awk '{s+=$1} END {if (s>0) printf \"%.6f\", s/1073741824}'" + ) + weights = _run_model_probe(host, ssh_port, size_cmd) + try: + weights_gb = float(weights) + except Exception: + weights_gb = 0.0 + if weights_gb > 0: + out["model_weights_gb"] = round(weights_gb, 3) + elif "model_probe_error" not in out: + out["model_probe_error"] = f"No model weight 
files found in: {path}" + return out + + def setup_hwfit_routes(): router = APIRouter(prefix="/api/hwfit", tags=["hwfit"]) @@ -235,7 +307,7 @@ def setup_hwfit_routes(): return {"system": system, "models": results} @router.get("/profiles") - def get_serve_profiles(model: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""): + def get_serve_profiles(model: str = "", model_path: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""): """Compute llama.cpp serve profiles (Quality/Balanced/Speed) for `model` against the detected hardware on `host` (or local). Returns concrete flags (n_gpu_layers, n_cpu_moe, cache_type, ctx) the serve UI can apply. @@ -260,8 +332,23 @@ def setup_hwfit_routes(): # "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct". s = (s or "").lower().strip() s = s.split("/")[-1] # drop org prefix - s = re.sub(r"[-_.]?gguf$", "", s) # drop trailing gguf marker - s = re.sub(r"[-_.](q\d[^/]*|iq\d[^/]*|fp8|bf16|f16|awq[^/]*|gptq[^/]*)$", "", s) + for suffix in ("-gguf", "_gguf", ".gguf", "gguf"): + if s.endswith(suffix): + s = s[: -len(suffix)] + break + cut_at = None + for idx, ch in enumerate(s): + if ch not in "-_." 
or idx + 1 >= len(s): + continue + suffix = s[idx + 1:] + if ( + suffix in {"fp8", "bf16", "f16"} + or suffix.startswith(("awq", "gptq", "iq")) + or (suffix.startswith("q") and len(suffix) > 1 and suffix[1].isdigit()) + ): + cut_at = idx + if cut_at is not None: + s = s[:cut_at] return s m = catalog.get(model) @@ -272,8 +359,16 @@ def setup_hwfit_routes(): if nn and (nn == want or want.endswith(nn) or nn.endswith(want)): m = entry break + path_meta = _inspect_model_path(model_path or model, host=host, ssh_port=ssh_port) if m is None: - return {"system": system, "profiles": [], "error": "model not in catalog"} + return { + "system": system, + "profiles": [], + "error": "model not in catalog", + "model_ctx_max": int(path_meta.get("model_ctx_max") or 0), + "model_weights_gb": float(path_meta.get("model_weights_gb") or 0), + "model_probe_error": path_meta.get("model_probe_error") or "", + } # Surface the model's trained context limit so the serve UI can clamp a # user-typed context down to it (asking for ctx > n_ctx_train overflows # and, with a quantized KV cache, can crash the GPU). 
@@ -283,6 +378,16 @@ def setup_hwfit_routes(): if isinstance(v, (int, float)) and v > 0: model_ctx_max = int(v) break + path_ctx_max = int(path_meta.get("model_ctx_max") or 0) + if path_ctx_max > 0: + model_ctx_max = max(model_ctx_max, path_ctx_max) + model_weights_gb = float(path_meta.get("model_weights_gb") or 0) + if model_weights_gb <= 0: + for k in ("min_vram_gb", "required_gb", "size_gb", "recommended_ram_gb", "min_ram_gb"): + v = m.get(k) + if isinstance(v, (int, float)) and v > 0: + model_weights_gb = float(v) + break return { "system": system, "profiles": compute_serve_profiles( @@ -291,6 +396,8 @@ def setup_hwfit_routes(): serve_quant=(serve_quant or None), ), "model_ctx_max": model_ctx_max, + "model_weights_gb": model_weights_gb, + "model_probe_error": path_meta.get("model_probe_error") or "", } @router.get("/image-models") diff --git a/routes/model_routes.py b/routes/model_routes.py index d8fde332b..fb9555438 100644 --- a/routes/model_routes.py +++ b/routes/model_routes.py @@ -406,8 +406,11 @@ def _endpoint_refresh_timeout(ep: Any, category: str) -> float: except Exception: val = 0 if val > 0: - return float(max(1, min(30, val))) - return 2.5 if category == "local" else 2.0 + return float(max(1, min(60, val))) + # llama.cpp and other local OpenAI-compatible servers can block briefly + # while warming/loading. A 2s local timeout makes working endpoints flicker + # offline before /v1/models is ready. 
+ return 10.0 if category == "local" else 2.0 def _manual_refresh_timeout(ep: Any, category: str, requested: Any = None) -> float: @@ -474,7 +477,7 @@ def _explicit_model_list_timeout(base_url: str, endpoint_kind: str = "auto", req category = _classify_endpoint(base_url, kind) if kind in ("api", "proxy") or category == "api": return 30.0 - return 3.0 if _is_ollama_base(base_url) else 2.0 + return 15.0 if category == "local" else (3.0 if _is_ollama_base(base_url) else 2.0) def _cached_model_ids(ep: Any) -> List[str]: @@ -579,6 +582,18 @@ def _safe_build_headers(api_key: Optional[str], base_url: str) -> dict: return {"Authorization": f"Bearer {api_key}"} if api_key else {} +def _redact_url_for_log(url: str) -> str: + """Return a URL safe for logs by removing userinfo and query/fragment.""" + try: + parsed = urlparse(url or "") + host = parsed.hostname or "" + if parsed.port: + host = f"{host}:{parsed.port}" + return urlunparse((parsed.scheme, host, parsed.path, "", "", "")) + except Exception: + return "" + + def _is_discovery_only_provider(provider: str) -> bool: return provider == "chatgpt-subscription" @@ -711,6 +726,16 @@ def _effective_endpoint_kind(ep: Any, base_url: str) -> str: return "auto" +def _is_loading_model_response(resp: Any) -> bool: + if getattr(resp, "status_code", None) != 503: + return False + try: + body = resp.text or "" + except Exception: + body = "" + return "loading model" in body.lower() + + def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> List[str]: """Probe a base URL's /models endpoint and return list of model IDs. 
@@ -775,11 +800,14 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis models.append(_e) return [m for m in models if _is_chat_model(m)] except httpx.HTTPStatusError as e: + if e.response is not None and _is_loading_model_response(e.response): + logger.info("Endpoint still loading model at %s", _redact_url_for_log(url)) + return [] if api_key: status = e.response.status_code if e.response is not None else "unknown" - logger.warning(f"Failed to probe {url} with API key: HTTP {status}") + logger.warning("Failed to probe %s with API key: HTTP %s", _redact_url_for_log(url), status) return [] - logger.warning(f"Failed to probe {url}: {e}") + logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e) except Exception as e: if api_key: logger.warning(f"Failed to probe {url} with API key: {e}") @@ -824,6 +852,15 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> or "ollama" in (parsed_base.hostname or "").lower() ) + def _is_loading_model_response(r) -> bool: + if getattr(r, "status_code", None) != 503: + return False + try: + body = r.text or "" + except Exception: + body = "" + return "loading model" in body.lower() + def _result_from_response(r) -> Dict[str, Any]: if 300 <= r.status_code < 400: loc = r.headers.get("location", "") @@ -840,6 +877,13 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> "status_code": r.status_code, "error": None, } + if _is_loading_model_response(r): + return { + "reachable": True, + "loading": True, + "status_code": r.status_code, + "error": "Loading model", + } return {"reachable": False, "status_code": r.status_code, "error": f"HTTP {r.status_code}"} last_error: Optional[str] = None @@ -872,7 +916,7 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> if 400 <= sc < 500 and sc not in (401, 403): models_url = _safe_build_models_url(base) try: - r2 = httpx.get(models_url, headers=headers, timeout=timeout, 
verify=llm_verify()) + r2 = httpx.get(models_url, headers=headers,timeout=timeout, verify=llm_verify()) result2 = _result_from_response(r2) if result2["reachable"]: return result2 @@ -1056,9 +1100,11 @@ def setup_model_routes(model_discovery): except Exception: return 0.0 - def _failure_delay(fails: int) -> float: + def _failure_delay(fails: int, *, empty_local: bool = False) -> float: if fails <= 0: return 0.0 + if empty_local: + return min(5.0 * (2 ** max(0, fails - 1)), 30.0) return min(_REFRESH_FAILURE_BASE * (2 ** max(0, fails - 1)), _REFRESH_FAILURE_MAX) def _should_refresh_endpoint(ep: Any, now: float, force: bool = False) -> tuple[bool, Dict[str, Any]]: @@ -1089,7 +1135,12 @@ def setup_model_routes(model_discovery): fails = int(state.get("fail_count") or 0) if fails and not force: last_failure = float(state.get("last_failure") or 0.0) - if now - last_failure < _failure_delay(fails): + empty_local = ( + not cached + and category == "local" + and str(getattr(ep, "id", "") or "").startswith("local-") + ) + if now - last_failure < _failure_delay(fails, empty_local=empty_local): return False, info if cached and not force: interval = _endpoint_refresh_interval(ep, category) @@ -1404,7 +1455,7 @@ def setup_model_routes(model_discovery): t0 = _time.time() ping = _ping_endpoint(base, ep.api_key, timeout=1.5) entry["latency_ms"] = round((_time.time() - t0) * 1000) - entry["status"] = "online" if ping.get("reachable") or cached_count else "offline" + entry["status"] = "loading" if ping.get("loading") else ("online" if ping.get("reachable") or cached_count else "offline") entry["error"] = ping.get("error") entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0) except Exception as e: @@ -1578,9 +1629,37 @@ def setup_model_routes(model_discovery): # "everything's already cached" path because this branch only # runs for endpoints with an empty cached_models. 
if not all_models and not pinned and r.is_enabled: - ping = _ping_endpoint(r.base_url, r.api_key, timeout=3.5) + base_for_ping = _normalize_base(r.base_url) + kind_for_ping = _effective_endpoint_kind(r, base_for_ping) + ping_timeout = 10.0 if _classify_endpoint(base_for_ping, kind_for_ping) == "local" else 3.5 + ping = _ping_endpoint(r.base_url, r.api_key, timeout=ping_timeout) if ping.get("reachable"): - status = "empty" + status = "loading" if ping.get("loading") else "empty" + if ping.get("loading"): + base = _normalize_base(r.base_url) + kind = _effective_endpoint_kind(r, base) + results.append({ + "id": r.id, + "name": r.name, + "base_url": r.base_url, + "has_key": bool(r.api_key), + "api_key_fingerprint": _api_key_fingerprint(r.api_key), + "is_enabled": r.is_enabled, + "models": visible, + "pinned_models": pinned, + "hidden_count": len(hidden), + "online": True, + "status": status, + "ping_error": (ping or {}).get("error") if ping else None, + "model_type": getattr(r, "model_type", None) or "llm", + "supports_tools": getattr(r, "supports_tools", None), + "endpoint_kind": kind, + "category": _classify_endpoint(base, kind), + "model_refresh_mode": _endpoint_refresh_mode(r, kind), + "model_refresh_interval": getattr(r, "model_refresh_interval", None), + "model_refresh_timeout": getattr(r, "model_refresh_timeout", None), + }) + continue # Best-effort: if the probe came back reachable, try # to populate cached_models in the background so the # NEXT picker load shows "online" instead of "empty". @@ -1588,7 +1667,7 @@ def setup_model_routes(model_discovery): # "empty" status, and the existing background refresh # path will eventually fill it in too. 
try: - probed = _probe_endpoint(r.base_url, r.api_key, timeout=5) + probed = _probe_endpoint(r.base_url, r.api_key, timeout=max(5, int(ping_timeout))) if probed: r.cached_models = json.dumps(probed) db.commit() @@ -1766,7 +1845,7 @@ def setup_model_routes(model_discovery): model_ids = _probe_endpoint(base_url, api_key.strip() or None, timeout=explicit_timeout) if should_probe else [] ping = {"reachable": False, "error": None} if (should_probe or requested_kind in ("api", "proxy")) and not model_ids: - ping = _ping_endpoint(base_url, api_key.strip() or None, timeout=min(explicit_timeout, 2.0)) + ping = _ping_endpoint(base_url, api_key.strip() or None, timeout=min(explicit_timeout, 10.0)) if require_model_list and not model_ids: raise HTTPException(400, _model_endpoint_error_message(base_url, ping)) @@ -1833,7 +1912,7 @@ def setup_model_routes(model_discovery): "models": _merge_model_ids(model_ids, _pinned), "pinned_models": _pinned, "online": bool(model_ids) or bool(_pinned) or bool(ping.get("reachable")), - "status": "online" if (model_ids or _pinned) else ("empty" if ping.get("reachable") else "offline"), + "status": "online" if (model_ids or _pinned) else ("loading" if ping.get("loading") else ("empty" if ping.get("reachable") else "offline")), "ping_error": ping.get("error") if ping else None, "endpoint_kind": requested_kind, "category": _classify_endpoint(base_url, requested_kind), @@ -1858,11 +1937,11 @@ def setup_model_routes(model_discovery): configured_timeout = _parse_positive_int(model_refresh_timeout, minimum=1, maximum=60) probe_timeout = _explicit_model_list_timeout(base_url, requested_kind, configured_timeout) models = _probe_endpoint(base_url, api_key.strip() or None, timeout=probe_timeout) - ping = {"reachable": True, "error": None} if models else _ping_endpoint(base_url, api_key.strip() or None, timeout=min(probe_timeout, 2.0)) + ping = {"reachable": True, "error": None} if models else _ping_endpoint(base_url, api_key.strip() or None, 
timeout=min(probe_timeout, 10.0)) return { "base_url": base_url, "online": bool(models) or bool(ping.get("reachable")), - "status": "online" if models else ("empty" if ping.get("reachable") else "offline"), + "status": "online" if models else ("loading" if ping.get("loading") else ("empty" if ping.get("reachable") else "offline")), "ping_error": ping.get("error") if ping else None, "models": models, "count": len(models), diff --git a/routes/shell_routes.py b/routes/shell_routes.py index 112b9fbca..d133b9254 100644 --- a/routes/shell_routes.py +++ b/routes/shell_routes.py @@ -331,6 +331,9 @@ def add_user_install_bins_to_path(): candidates.append(os.path.join(site.USER_BASE, 'bin')) except Exception: pass + candidates.append(os.path.expanduser('~/bin')) + candidates.append(os.path.expanduser('~/llama.cpp/build/bin')) + candidates.append(os.path.expanduser('~/llama.cpp/build-vulkan/bin')) candidates.append(os.path.expanduser('~/.local/bin')) parts = os.environ.get('PATH', '').split(os.pathsep) if os.environ.get('PATH') else [] changed = False @@ -962,12 +965,84 @@ def setup_shell_routes() -> APIRouter: return StreamingResponse(generate(), media_type="text/event-stream") + def _os_id_from_release(text: str) -> str: + """Map /etc/os-release contents to a canonical family for our matrix.""" + if not text: + return "" + ids = [] + for line in text.splitlines(): + line = line.strip() + if line.startswith("ID=") or line.startswith("ID_LIKE="): + ids += line.split("=", 1)[1].strip().strip('"').split() + ids = [i.lower() for i in ids] + if any(x in ids for x in ("debian", "ubuntu", "linuxmint", "pop", "elementary")): + return "debian" + if any(x in ids for x in ("arch", "manjaro", "endeavouros", "cachyos", "garuda")): + return "arch" + if any(x in ids for x in ("fedora", "rhel", "centos", "rocky", "almalinux", "ol")): + return "fedora" + if "alpine" in ids: + return "alpine" + if any(x in ids for x in ("suse", "opensuse", "opensuse-leap", "opensuse-tumbleweed", "sles")): + 
return "suse" + return "" + + # Matrix lookup keyed on (os_family, backend) → (pkg_mgr_cmd_template, pkg_list_per_dep). + # Each `system_prereqs` name resolves to a list of OS-specific package + # names that get joined into the final `sudo apt install -y …` etc. + # command. Backend-specific extras (CUDA toolkit, ROCm, Vulkan headers) + # are added only when the detected backend needs them. + _PKG_NAMES = { + # canonical-name → {os_id: [actual_pkg_names_on_this_os]} + "cmake": {"debian": ["cmake"], "arch": ["cmake"], "fedora": ["cmake"], "alpine": ["cmake"], "suse": ["cmake"], "macos": ["cmake"]}, + "build-essential": {"debian": ["build-essential"], "arch": ["base-devel"], "fedora": ["gcc", "gcc-c++", "make"], "alpine": ["build-base"], "suse": ["gcc-c++", "make"], "macos": []}, + "g++": {"debian": ["g++"], "arch": ["gcc"], "fedora": ["gcc-c++"], "alpine": ["g++"], "suse": ["gcc-c++"], "macos": []}, + "gcc": {"debian": ["gcc"], "arch": ["gcc"], "fedora": ["gcc"], "alpine": ["gcc"], "suse": ["gcc"], "macos": []}, + "make": {"debian": ["make"], "arch": ["make"], "fedora": ["make"], "alpine": ["make"], "suse": ["make"], "macos": []}, + "git": {"debian": ["git"], "arch": ["git"], "fedora": ["git"], "alpine": ["git"], "suse": ["git"], "macos": ["git"]}, + "tmux": {"debian": ["tmux"], "arch": ["tmux"], "fedora": ["tmux"], "alpine": ["tmux"], "suse": ["tmux"], "macos": ["tmux"]}, + } + _BACKEND_EXTRAS = { + "cuda": {"debian": ["nvidia-cuda-toolkit"], "arch": ["cuda"], "fedora": ["cuda-toolkit"], "alpine": [], "suse": ["cuda"], "macos": []}, + "rocm": {"debian": ["rocm-dev"], "arch": ["rocm-hip-sdk"], "fedora": ["rocm-devel"], "alpine": [], "suse": ["rocm-dev"], "macos": []}, + "vulkan": {"debian": ["libvulkan-dev", "vulkan-tools"], "arch": ["vulkan-headers", "vulkan-tools"], "fedora": ["vulkan-headers", "vulkan-tools"], "alpine": ["vulkan-loader-dev", "vulkan-tools"], "suse": ["vulkan-devel", "vulkan-tools"], "macos": []}, + } + _PKG_MGR = { + "debian": "sudo apt install 
-y {pkgs}", + "arch": "sudo pacman -S --needed {pkgs}", + "fedora": "sudo dnf install -y {pkgs}", + "alpine": "sudo apk add {pkgs}", + "suse": "sudo zypper install -n {pkgs}", + "macos": "brew install {pkgs}", + } + + def _install_cmd_for_target(os_id: str, backend: str, missing: list[str]) -> str: + """Build a single OS+backend-aware install command for the missing prereqs.""" + if not os_id or os_id not in _PKG_MGR: + return "" + pkgs: list[str] = [] + seen: set[str] = set() + for m in missing: + for p in _PKG_NAMES.get(m, {}).get(os_id, []): + if p not in seen: + pkgs.append(p); seen.add(p) + # Add backend-specific extras only when the build would actually + # consume them (a CUDA toolkit isn't useful on a Vulkan box). + backend = (backend or "").lower() + for p in _BACKEND_EXTRAS.get(backend, {}).get(os_id, []): + if p not in seen: + pkgs.append(p); seen.add(p) + if not pkgs: + return "" + return _PKG_MGR[os_id].format(pkgs=" ".join(pkgs)) + @router.get("/api/cookbook/packages") async def list_packages( request: Request, host: str | None = None, ssh_port: str | None = None, venv: str | None = None, + backend: str | None = None, ): """Check which optional packages are installed. @@ -1016,6 +1091,12 @@ def setup_shell_routes() -> APIRouter: "kind": "system", "install_hint": "Install Docker on the selected server and allow this user to run docker.", }, + # Note: cmake / gcc / git are not separate dependency rows — + # they're declared as `system_prereqs` on llama_cpp (and any + # other engine that compiles from source) so they appear as + # an inline status note on that engine's row instead of + # cluttering the panel with raw OS package names that aren't + # meaningful product-level dependencies on their own. 
# ── LLM ── installs on GPU servers for model serving/downloading { "name": "hf_transfer", @@ -1027,9 +1108,16 @@ def setup_shell_routes() -> APIRouter: { "name": "llama_cpp", "pip": "llama-cpp-python[server]", - "desc": "Serve GGUF models via llama.cpp", + "desc": "Great for single-GPU or CPU inference with GGUF models", "category": "LLM", "target": "remote", + # Build-toolchain prereqs. Cookbook's launch bootstrap + # compiles llama-server from source when no prebuilt + # binary is present; without these the build aborts + # with `cmake: command not found`. Surfaced inline on + # this row so the user doesn't have to chase three + # separate OS-package rows. + "system_prereqs": ["cmake", "g++", "git"], }, { "name": "sglang", @@ -1041,7 +1129,7 @@ def setup_shell_routes() -> APIRouter: { "name": "vllm", "pip": "vllm", - "desc": "High-throughput LLM serving engine", + "desc": "Great for high-throughput multi-GPU inference", "category": "LLM", "target": "remote", }, @@ -1104,6 +1192,7 @@ def setup_shell_routes() -> APIRouter: # venv over SSH so a remote `pip install` actually reflects here. 
remote_status: dict = {} remote_details: dict = {} + remote_probe_error = "" remote_names = [ p["name"] for p in packages @@ -1142,16 +1231,56 @@ def setup_shell_routes() -> APIRouter: break except ValueError as e: raise HTTPException(400, str(e)) - except Exception: + except Exception as e: remote_status = {} - if host and remote_system_names: + remote_probe_error = f"SSH package probe failed: {str(e)[:160]}" + if "llama_cpp" in remote_names: + try: + inner = ( + 'export PATH="$HOME/.local/bin:$HOME/bin:' + '$HOME/llama.cpp/build/bin:$HOME/llama.cpp/build-vulkan/bin:$PATH"; ' + "command -v llama-server 2>/dev/null || true" + ) + argv = _ssh_base_argv(host, ssh_port) + [inner] + proc = await asyncio.create_subprocess_exec( + *argv, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + out, _err = await asyncio.wait_for(proc.communicate(), timeout=8) + llama_server_path = out.decode("utf-8", errors="replace").strip().splitlines() + llama_server_path = llama_server_path[-1].strip() if llama_server_path else "" + if llama_server_path: + remote_status["llama_cpp"] = True + probe = remote_details.setdefault("llama_cpp", {}) + if isinstance(probe, dict): + probe.setdefault("binaries", {})["llama-server"] = llama_server_path + except Exception as e: + if not remote_probe_error: + remote_probe_error = f"SSH llama-server probe failed: {str(e)[:160]}" + pass + # Union of system_names + every package's system_prereqs. Probing + # the prereqs alongside the main system deps in a single SSH call + # avoids a second round-trip per Cookbook → Dependencies refresh. 
+ prereq_names: set[str] = set() + for p in packages: + for pr in p.get("system_prereqs") or []: + prereq_names.add(str(pr)) + all_system_names = list(set(remote_system_names) | prereq_names) + # Detect the target's OS family + read /etc/os-release in the same + # SSH round-trip as the prereq probe — used downstream to render a + # single OS-specific install command per row instead of dumping + # every distro's syntax onto the user. + target_os_id: str = "" + if host and all_system_names: try: checks = [] - for name in remote_system_names: + for name in all_system_names: qn = shlex.quote(name) checks.append( f"if command -v {qn} >/dev/null 2>&1; then echo {qn}=1; else echo {qn}=0; fi" ) + checks.append("echo '---OSREL---'; cat /etc/os-release 2>/dev/null || true") inner = " ; ".join(checks) argv = _ssh_base_argv(host, ssh_port) + [inner] proc = await asyncio.create_subprocess_exec( @@ -1161,20 +1290,45 @@ def setup_shell_routes() -> APIRouter: ) out, _err = await asyncio.wait_for(proc.communicate(), timeout=12) txt = out.decode("utf-8", errors="replace").strip() + _section, _osrel_lines = "probe", [] for line in txt.splitlines(): + if line.strip() == "---OSREL---": + _section = "osrel"; continue + if _section == "osrel": + _osrel_lines.append(line) + continue name, sep, value = line.strip().partition("=") - if sep and name in remote_system_names: + if sep and name in all_system_names: remote_status[name] = value == "1" + target_os_id = _os_id_from_release("\n".join(_osrel_lines)) except ValueError as e: raise HTTPException(400, str(e)) - except Exception: + except Exception as e: + if not remote_probe_error: + remote_probe_error = f"SSH system probe failed: {str(e)[:160]}" pass + elif not host: + # Local target — probe in-process so the inline install command + # still appears in the dep panel when the cookbook container + # itself is the selected server. 
+ try: + with open("/etc/os-release", encoding="utf-8") as f: + target_os_id = _os_id_from_release(f.read()) + except Exception: + target_os_id = "" + if sys.platform == "darwin": + target_os_id = "macos" for pkg in packages: on_remote = bool(host and pkg.get("target") == "remote") probe = None if on_remote: - pkg["installed"] = bool(remote_status.get(pkg["name"], False)) + if remote_probe_error and pkg["name"] not in remote_status: + pkg["installed"] = None + pkg["probe_error"] = remote_probe_error + pkg["status_note"] = remote_probe_error + else: + pkg["installed"] = bool(remote_status.get(pkg["name"], False)) probe = remote_details.get(pkg["name"]) if isinstance(probe, dict): pkg["details"] = probe @@ -1230,6 +1384,104 @@ def setup_shell_routes() -> APIRouter: # 500 the entire packages panel; report it as not usable. pkg["installed"] = False + # llama_cpp partial-state probe: when the package is installed + # but the wheel was built CPU-only AND the target has NVIDIA + # hardware, mark the row as partial (yellow/orange) with a + # one-click upgrade to the CUDA wheel. Without this the row + # reads "ready" green while inference runs at 3 tok/s on GPU + # silicon — actively misleading. + if pkg["name"] == "llama_cpp" and pkg.get("installed"): + _native_llama_server = bool( + isinstance(probe, dict) + and isinstance(probe.get("binaries"), dict) + and probe["binaries"].get("llama-server") + ) + _gpu_capable = False + _has_nvidia_target = False + if _native_llama_server: + # Native llama-server is the launcher path Cookbook now + # prefers. Do not mark this as a CPU-only Python wheel just + # because llama-cpp-python is absent from the selected venv. + _gpu_capable = True + elif on_remote and host: + try: + # Activate the configured venv FIRST so the probe + # runs against the same python the launch script + # would activate. Without this prefix, bare + # `python3` was checked — which can disagree with + # the venv's wheel (e.g. 
user-site has CUDA wheel + # but venv has CPU-only), and the dep panel then + # showed "ready" green while every launch fell to + # CPU. + _vp = _venv_activate_prefix(venv) + probe = ( + f'{_vp}python3 -c "import llama_cpp; import sys; ' + 'sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" ' + '&& echo llama_cpp_gpu=1 || echo llama_cpp_gpu=0; ' + 'command -v nvidia-smi >/dev/null 2>&1 ' + '&& nvidia-smi -L 2>/dev/null | grep -q "GPU " ' + '&& echo nvidia=1 || echo nvidia=0' + ) + argv = _ssh_base_argv(host, ssh_port) + [probe] + proc = await asyncio.create_subprocess_exec( + *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, + ) + out, _ = await asyncio.wait_for(proc.communicate(), timeout=8) + txt = out.decode("utf-8", errors="replace") + if "llama_cpp_gpu=1" in txt: + _gpu_capable = True + if "nvidia=1" in txt: + _has_nvidia_target = True + except Exception: + pass + else: + try: + import llama_cpp as _lcp # type: ignore + _gpu_capable = bool(_lcp.llama_supports_gpu_offload()) + except Exception: + _gpu_capable = False + _has_nvidia_target = shutil.which("nvidia-smi") is not None + if (not _gpu_capable) and _has_nvidia_target: + pkg["partial"] = True + pkg["partial_reason"] = "Installed but CPU-only wheel — GPU detected on this target. Upgrade to a CUDA wheel for ~10× faster inference." + pkg["partial_action"] = "reinstall_llama_cpp_cuda" + # Attach per-package system_prereqs status. We probed each + # prereq name above; surface "Missing build deps: …" ONLY + # when the package itself is not installed — if the package + # works (e.g. llama-cpp-python already imports cleanly), the + # build toolchain is irrelevant and surfacing it as a red + # flag confuses users ("ready" + "missing" on the same row). 
+ _prereqs = list(pkg.get("system_prereqs") or []) + if _prereqs: + if on_remote: + _pr_present = {n: bool(remote_status.get(n)) for n in _prereqs} + else: + _pr_present = {n: shutil.which(n) is not None for n in _prereqs} + pkg["system_prereqs_status"] = _pr_present + _missing = [n for n, ok in _pr_present.items() if not ok] + # Suppress the "missing build deps" hint when the package + # itself is installed — build deps are only relevant if + # the user would need to recompile from source. + if pkg.get("installed"): + _missing = [] + if _missing: + # Build a target-specific install command from the + # (os_family, backend) matrix when we know both. Fall + # back to the multi-distro hint only when the target's + # OS can't be classified (e.g. ssh probe failed). + _resolved_os = target_os_id or "debian" # safest default + _cmd = _install_cmd_for_target(_resolved_os, backend or "", _missing) + if _cmd and target_os_id: + _hint = "Missing build deps for this target: " + ", ".join(_missing) + pkg["install_cmd_for_target"] = _cmd + pkg["install_cmd_os"] = target_os_id + pkg["install_cmd_backend"] = (backend or "").lower() + else: + _hint = "Missing build deps: " + ", ".join(_missing) + ". Install via apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git." 
@router.post("/api/cookbook/install-system-deps")
async def install_system_deps(request: Request):
    """Install allowlisted OS-level build packages locally or on an SSH target.

    The admin check is delegated to ``_require_admin``. Only names in the
    allowlist below are ever placed on a command line, so the route cannot be
    coerced into installing arbitrary OS packages. The generated script uses
    ``sudo -n`` (non-interactive) so a target that needs a sudo password
    fails fast with a readable message instead of hanging.

    Request JSON fields read by this handler:
        packages: list of package names; entries outside the allowlist are
            dropped, duplicates are collapsed.
        remote_host: optional; when non-empty the script is appended to
            ``_ssh_base_argv(host, ssh_port)``, otherwise it runs through
            ``bash -lc`` in the local process environment.
        ssh_port: passed through unchanged to ``_ssh_base_argv``.

    Returns:
        ``{"ok": False, "error": ...}`` when nothing installable was requested,
        the installer could not be started, or it timed out; otherwise a dict
        with ``ok``, ``exit_code``, ``output`` (last 1000 chars of stdout) and
        ``error`` (``None`` on success).

    Raises:
        HTTPException(400): the body is not a JSON object, or
            ``_ssh_base_argv`` rejected the host/port with ``ValueError``.
    """
    _require_admin(request)
    try:
        body = await request.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError; report it as a client error.
        raise HTTPException(400, "request body must be valid JSON")
    if not isinstance(body, dict):
        raise HTTPException(400, "request body must be a JSON object")

    raw = body.get("packages") or []
    if not isinstance(raw, (list, tuple)):
        # A bare string would otherwise be iterated character by character.
        raw = []
    # str(...) mirrors rebuild_engine and tolerates non-string JSON values.
    host = str(body.get("remote_host") or "").strip()
    ssh_port = body.get("ssh_port")

    # Names users can request — must match canonical names used in the
    # deps catalog's `system_prereqs` field and on the System rows.
    ALLOWED = {"cmake", "build-essential", "g++", "gcc", "git", "tmux", "make"}
    # dict.fromkeys de-duplicates while keeping the caller's order.
    pkgs = list(dict.fromkeys(
        name for name in (str(p).strip() for p in raw) if name in ALLOWED
    ))
    if not pkgs:
        return {"ok": False, "error": "no installable packages requested (allowlist: " + ", ".join(sorted(ALLOWED)) + ")"}

    # Re-map to the right package name per package manager. apt uses the
    # names as-is; pacman has base-devel for build-essential, etc.
    def _apt(names):
        return list(names)

    def _pacman(names):
        return ["base-devel" if n == "build-essential" else n for n in names]

    def _dnf(names):
        out = []
        for n in names:
            if n == "build-essential":
                out += ["gcc", "gcc-c++", "make"]
            elif n == "g++":
                out += ["gcc-c++"]
            else:
                out.append(n)
        return out

    def _brew(names):
        # The compiler toolchain names have no brew formula in this mapping.
        return [n for n in names if n not in ("build-essential", "g++", "gcc", "make")]

    def _quoted(names):
        # Shell-quote each name; drop duplicates the remapping may introduce.
        return " ".join(shlex.quote(n) for n in dict.fromkeys(names))

    apt_pkgs = _quoted(_apt(pkgs))
    pac_pkgs = _quoted(_pacman(pkgs))
    dnf_pkgs = _quoted(_dnf(pkgs))
    brew_pkgs = _quoted(_brew(pkgs))

    if brew_pkgs:
        brew_step = f' brew install {brew_pkgs}; '
    else:
        # `brew install` with no formula only prints a usage error; say why.
        brew_step = (
            ' echo "ERROR: none of the requested packages can be installed through brew on this target '
            '(on macOS the compiler toolchain comes from: xcode-select --install)." >&2; exit 4; '
        )

    # Single shell snippet that detects the package manager and runs the
    # matching install. Error messages go to stderr (>&2) so the route's
    # `error` field is populated instead of being left empty.
    script = (
        'set -e; '
        'if ! sudo -n true 2>/dev/null; then '
        ' echo "ERROR: passwordless sudo unavailable on this target. Run once: sudo apt install -y ' + " ".join(pkgs) + ' (or your distro equivalent: pacman -S, dnf install, brew install). After that, Cookbook can install the rest." >&2; exit 2; fi; '
        'if command -v apt-get >/dev/null 2>&1; then '
        f' sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update -qq && sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends {apt_pkgs}; '
        'elif command -v pacman >/dev/null 2>&1; then '
        f' sudo -n pacman -Sy --needed --noconfirm {pac_pkgs}; '
        'elif command -v dnf >/dev/null 2>&1; then '
        f' sudo -n dnf install -y {dnf_pkgs}; '
        'elif command -v brew >/dev/null 2>&1; then '
        + brew_step +
        'else '
        ' echo "ERROR: no supported package manager (apt/pacman/dnf/brew) on this target." >&2; exit 3; fi'
    )

    try:
        if host:
            argv = _ssh_base_argv(host, ssh_port) + [script]
        else:
            argv = ["bash", "-lc", script]
    except ValueError as e:
        raise HTTPException(400, str(e))

    try:
        proc = await asyncio.create_subprocess_exec(
            *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
    except OSError as e:
        # e.g. the ssh/bash executable is missing; report it in-band.
        return {"ok": False, "error": f"could not start installer: {e}"}

    try:
        out, err = await asyncio.wait_for(proc.communicate(), timeout=180)
    except asyncio.TimeoutError:
        # wait_for only cancels communicate(); without an explicit kill the
        # child keeps running unreaped with nobody draining its pipes.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # already exited between the timeout and the kill
        await proc.wait()
        return {"ok": False, "error": "Install timed out after 180s"}

    ok = (proc.returncode == 0)
    # Combine stderr + (tail of stdout) into one error blob when ok=False:
    # some package managers print their failure context to stdout only.
    err_txt = err.decode("utf-8", errors="replace").strip()
    out_txt = out.decode("utf-8", errors="replace").strip()
    if not ok:
        tail_out = out_txt[-500:] if out_txt else ""
        combined = err_txt or tail_out or f"exit code {proc.returncode}"
    else:
        combined = None
    return {
        "ok": ok,
        "exit_code": proc.returncode,
        "output": out_txt[-1000:],
        "error": combined,
    }
def _promote_chat_image_to_gallery(meta: dict, owner: str | None) -> str | None:
    """Make a chat-uploaded image visible in Gallery without changing chat storage.

    The upload stays where the upload handler saved it; this copies the file
    into ``GENERATED_IMAGES_DIR`` and records a ``GalleryImage`` row for it.

    Args:
        meta: upload metadata; this function reads ``name``, ``mime``,
            ``path``, ``hash``, ``width``, ``height`` and ``size``.
        owner: user the gallery row is attributed to. When truthy it also
            scopes the duplicate lookup; when falsy the lookup is by hash only.

    Returns:
        The id of the existing or newly created gallery row, or ``None`` when
        the upload is not an image, its file is missing, or promotion failed
        (failures are logged as warnings, never raised to the upload route).
    """
    is_image_file = getattr(upload_handler, "is_image_file", None)
    if not callable(is_image_file):
        return None
    if not is_image_file(meta.get("name", ""), meta.get("mime", "")):
        return None

    source_path = meta.get("path")
    if not source_path or not os.path.isfile(source_path):
        return None

    db = SessionLocal()
    # Set once a copy is about to exist on disk, so a later failure can
    # remove it instead of leaving an orphan with no database row.
    dest_path = None
    try:
        file_hash = meta.get("hash")
        if file_hash:
            # Re-use an active row with the same content hash instead of
            # copying the same image into the gallery twice.
            q = db.query(GalleryImage).filter(
                GalleryImage.file_hash == file_hash,
                GalleryImage.is_active == True,  # noqa: E712
            )
            if owner:
                q = q.filter(GalleryImage.owner == owner)
            existing = q.first()
            if existing:
                return existing.id

        image_dir = Path(GENERATED_IMAGES_DIR)
        image_dir.mkdir(parents=True, exist_ok=True)
        ext = Path(meta.get("name") or source_path).suffix.lower()
        if ext not in {".png", ".jpg", ".jpeg", ".webp", ".gif"}:
            # Unknown or missing suffix: derive one from the MIME type,
            # defaulting to .png.
            mime_ext = {
                "image/png": ".png",
                "image/jpeg": ".jpg",
                "image/jpg": ".jpg",
                "image/webp": ".webp",
                "image/gif": ".gif",
            }.get(meta.get("mime", ""))
            ext = mime_ext or ".png"
        filename = f"{uuid.uuid4().hex[:12]}{ext}"
        dest_path = image_dir / filename
        shutil.copy2(source_path, dest_path)

        image_id = str(uuid.uuid4())
        db.add(GalleryImage(
            id=image_id,
            filename=filename,
            prompt=meta.get("name") or "Chat upload",
            model="chat-upload",
            owner=owner,
            file_hash=file_hash,
            width=meta.get("width"),
            height=meta.get("height"),
            file_size=meta.get("size"),
        ))
        db.commit()
        return image_id
    except Exception as e:
        # Remove a (possibly partial) copy first: without the row nothing
        # would ever reference or clean up that file.
        if dest_path is not None:
            try:
                dest_path.unlink(missing_ok=True)
            except OSError:
                pass
        db.rollback()
        logger.warning("Failed to add chat image upload to gallery: %s", e)
        return None
    finally:
        db.close()
_detect_amd(): "gpus": cards, "gpu_groups": groups, "homogeneous": len(groups) <= 1, - "backend": "rocm", + # Pick the actual runtime label: ROCm/HIP only when its + # toolchain is installed, otherwise Vulkan if vulkaninfo is + # present (mesa RADV works fine on RDNA/CDNA when ROCm + # packages are absent — see Strix Halo where ROCm support + # is still backporting). Reporting "rocm" on a Vulkan-only + # host misleads downstream env-var pinning + # (HIP_VISIBLE_DEVICES is a no-op there). + "backend": ( + "rocm" if (_run(["which", "rocminfo"]) or _run(["which", "hipconfig"])) + else ("vulkan" if _run(["which", "vulkaninfo"]) else "rocm") + ), "unified_memory": is_apu, # AMD ISA/family so downstream can tell datacenter Instinct (CDNA, # where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon diff --git a/src/agent_loop.py b/src/agent_loop.py index 219765a18..c90168324 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -541,17 +541,44 @@ def _section_text(name: str, default: str) -> str: return val if isinstance(val, str) and val.strip() else default +def _compact_tool_line(name: str, section: str) -> str: + """One-line fenced-tool usage hint for compact/local prompts.""" + text = (section or "").strip() + if not text: + return f"- `{name}`" + if text.startswith("- "): + return text + lines = [ln.strip() for ln in text.splitlines() if ln.strip()] + usage = [] + in_fence = False + for ln in lines: + if ln.startswith("```"): + usage.append(ln) + in_fence = not in_fence + if len(usage) >= 3: + break + continue + if in_fence and len(usage) < 3: + usage.append(ln) + if usage: + return f"- `{name}` — " + " ".join(usage) + return f"- `{name}` — " + lines[0][:160] + + def _assemble_prompt(tool_names: set, disabled_tools: set = None, compact: bool = False) -> str: """Build the system prompt with only the specified tools included.""" disabled = disabled_tools or set() included = tool_names - disabled if compact: - tool_list = ", ".join(sorted(included)) if included 
else "none" + tool_lines = [] + for name, _default_section in TOOL_SECTIONS.items(): + if name in included: + tool_lines.append(_compact_tool_line(name, _section_text(name, _default_section))) parts = [ - "You are an AI assistant with tool access.", - f"Available tools: {tool_list}.", - _API_AGENT_RULES, + _AGENT_PREAMBLE, + "## Available tools\n" + ("\n".join(tool_lines) if tool_lines else "none"), + _AGENT_RULES, ] parts.extend(_domain_rules_for_tools(included)) return "\n\n".join(parts) @@ -617,11 +644,6 @@ _API_HOSTS = frozenset([ "api.perplexity.ai", "api.x.ai", "ollama.com", "api.venice.ai", "api.kimi.com", "api.githubcopilot.com", - # Local OpenAI-compatible endpoints (llama.cpp, vLLM, LM Studio, etc.). - # Without these, `_is_api_model` falls back to keyword sniffing on the - # model name, so well-behaved local servers don't get native tool - # schemas and the agent silently degrades to fenced-block parsing. - "localhost", "127.0.0.1", "host.docker.internal", ]) _MCP_KEYWORDS = frozenset(["mcp", "browse", "browser", "website", "calendar", "event", "email", "gmail", "screenshot", "navigate", "click", "miniflux", "rss", "feed"]) @@ -649,6 +671,28 @@ def _is_ollama_openai_compat_url(endpoint_url: str) -> bool: return parsed.port == 11434 and (path == "/v1" or path.startswith("/v1/")) +def _is_local_openai_compat_url(endpoint_url: str) -> bool: + try: + parsed = urlparse(endpoint_url or "") + except Exception: + return False + host = (parsed.hostname or "").lower() + path = (parsed.path or "").rstrip("/") + if not (path == "/v1" or path.startswith("/v1/")): + return False + if host in {"localhost", "127.0.0.1", "0.0.0.0", "host.docker.internal"}: + return True + if host.startswith("192.168.") or host.startswith("10."): + return True + if host.startswith("172."): + try: + second = int(host.split(".")[1]) + return 16 <= second <= 31 + except Exception: + return False + return False + + def _endpoint_lookup_keys(endpoint_url: str) -> List[str]: """Candidate 
ModelEndpoint.base_url keys for a runtime chat URL.""" raw = (endpoint_url or "").strip() @@ -712,6 +756,17 @@ def _extract_last_user_message(messages: List[Dict]) -> str: _LOW_SIGNAL_RE = re.compile(r"^[\W_]*$", re.UNICODE) +_CASUAL_OPENING_RE = re.compile( + r"^\s*(?:h+i+|hey+|hello+|yo+|sup+|what'?s up|wass?up|hiya|howdy|" + r"lol|lmao|haha+|hehe+|thanks?|thank you|ty|idk|dunno|meh|bruh|bro)\b(?P.*)$", + re.IGNORECASE, +) +_CASUAL_BLOCKLIST_RE = re.compile( + r"\b(?:cookbook|serve|serving|launch|start|vllm|sglang|llama\.?cpp|ollama|" + r"download|model|email|document|doc|note|calendar|task|search|web|research|" + r"file|folder|repo|git|settings?|endpoint|api|token|mcp)\b", + re.IGNORECASE, +) _EXPLICIT_CONTINUATION_RE = re.compile( r"^\s*(?:" r"yes|y|yeah|yep|ok|okay|sure|do it|go ahead|continue|carry on|" @@ -721,6 +776,17 @@ _EXPLICIT_CONTINUATION_RE = re.compile( r")\s*[.!?]*\s*$", re.IGNORECASE, ) +_RETRY_CONTINUATION_RE = re.compile( + r"\b(?:try again|retry|again|rerun|re-run|run it again|launch it again|" + r"start it again|failed|fails?|died|crashed|broke|insta|instantly)\b", + re.IGNORECASE, +) +_COOKBOOK_CONTEXT_RE = re.compile( + r"\b(?:cookbook|serve|serving|served|launch|start|preset|vllm|sglang|" + r"llama\.?cpp|ollama|download|cached models?|model servers?|running models?|" + r"gpu box|ajax|qwen|gemma|llama|mistral|minimax)\b", + re.IGNORECASE, +) def _is_explicit_continuation(text: str) -> bool: @@ -728,6 +794,37 @@ def _is_explicit_continuation(text: str) -> bool: return bool(_EXPLICIT_CONTINUATION_RE.match(str(text or "").strip())) +def _is_casual_low_signal(text: str) -> bool: + """True for short greetings/slang that should not inherit stale context.""" + s = str(text or "").strip() + m = _CASUAL_OPENING_RE.match(s) + if not m: + return False + tail = m.group("tail") or "" + if _CASUAL_BLOCKLIST_RE.search(tail): + return False + # Allow a short vocative/address after the opener without hardcoding the + # address term itself: "hey man", "yo 
dude", "sup ". Longer tails are + # more likely to be an actual request and should get normal context/tooling. + tail_words = re.findall(r"[A-Za-z0-9_'-]+", tail) + return len(tail_words) <= 2 + + +def _is_contextual_retry_continuation(messages: List[Dict], text: str) -> bool: + """Treat "try again / it failed" as a continuation only for active tool work. + + These follow-ups are common after Cookbook launches: the latest user turn + says only "try again it failed", while the actionable model/host/command + details live one or two turns back. Keep this intentionally narrow so + ordinary chat does not inherit stale Cookbook context. + """ + latest = str(text or "").strip() + if not latest or not _RETRY_CONTINUATION_RE.search(latest): + return False + recent = _recent_context_for_retrieval(messages, max_user=5, max_chars=1200) + return bool(_COOKBOOK_CONTEXT_RE.search(recent)) + + def _assistant_requested_followup(messages: List[Dict]) -> bool: """True when the previous assistant turn asked for missing task details. @@ -769,11 +866,12 @@ def _classify_agent_request(messages: List[Dict], last_user: str) -> Dict[str, o which domain rule packs get appended to the system prompt. 
""" text = str(last_user or "").strip() - continuation = _is_explicit_continuation(text) or _assistant_requested_followup(messages) + retry_continuation = _is_contextual_retry_continuation(messages, text) + continuation = _is_explicit_continuation(text) or _assistant_requested_followup(messages) or retry_continuation retrieval_query = _recent_context_for_retrieval(messages) if continuation else text q = retrieval_query.lower() - if not text or bool(_LOW_SIGNAL_RE.match(text)): + if not text or bool(_LOW_SIGNAL_RE.match(text)) or _is_casual_low_signal(text): return { "low_signal": True, "continuation": False, @@ -886,6 +984,7 @@ def _build_system_prompt( compact: bool = False, owner: Optional[str] = None, suppress_local_context: bool = False, + suppress_skills: bool = False, active_email: Optional[Dict[str, str]] = None, ) -> List[Dict]: """Build agent system prompt, inject MCP/document context, merge consecutive system msgs.""" @@ -903,7 +1002,7 @@ def _build_system_prompt( _ov_sig = _hl.sha256(_json.dumps(get_builtin_overrides() or {}, sort_keys=True).encode()).hexdigest() except Exception: _ov_sig = "" - cache_key = (frozenset(disabled_tools or []), bool(mcp_mgr), needs_admin, _rt_key, compact, _ov_sig, owner, suppress_local_context) + cache_key = (frozenset(disabled_tools or []), bool(mcp_mgr), needs_admin, _rt_key, compact, _ov_sig, owner, suppress_local_context, suppress_skills) if _cached_base_prompt and _cached_base_prompt_key == cache_key and not active_document: agent_prompt = _cached_base_prompt # Skill index is user-editable (name + description), so it must never @@ -913,6 +1012,7 @@ def _build_system_prompt( disabled_tools, mcp_mgr, needs_admin, relevant_tools, mcp_disabled_map=mcp_disabled_map, compact=compact, owner=owner, suppress_local_context=suppress_local_context, + suppress_skills=suppress_skills, ) else: agent_prompt, _skill_index_block = _build_base_prompt( @@ -924,6 +1024,7 @@ def _build_system_prompt( compact=compact, owner=owner, 
suppress_local_context=suppress_local_context, + suppress_skills=suppress_skills, ) if not active_document: _cached_base_prompt = agent_prompt @@ -1207,7 +1308,7 @@ def _build_system_prompt( # few. If the teacher wrote a procedure for "open my X chat" last # time the student failed, this is where the student finds it # before deciding which tool to call. - if not suppress_local_context: + if not suppress_local_context and not suppress_skills: try: last_user = _extract_last_user_message(messages) # Respect the user's skills-enabled toggle (mirrors memory_enabled). @@ -1374,6 +1475,7 @@ def _build_base_prompt( compact: bool = False, owner: Optional[str] = None, suppress_local_context: bool = False, + suppress_skills: bool = False, ): """Build the agent prompt with only relevant tools included. @@ -1426,7 +1528,7 @@ def _build_base_prompt( # The caller wraps it in untrusted_context_message and ships it as a # user-role message — same treatment as the matched-skills block. skill_index_block = "" - if not suppress_local_context: + if not suppress_local_context and not suppress_skills: try: from services.memory.skills import SkillsManager from src.constants import DATA_DIR @@ -1851,6 +1953,7 @@ async def stream_agent_loop( approved_plan: Optional[str] = None, tool_policy: Optional[ToolPolicy] = None, workspace: Optional[str] = None, + forced_tools: Optional[Set[str]] = None, _is_teacher_run: bool = False, ) -> AsyncGenerator[str, None]: """Streaming agent loop generator. 
@@ -1890,6 +1993,20 @@ async def stream_agent_loop( _needs_admin = _detect_admin_intent(messages) _last_user = _extract_last_user_message(messages) _intent = _classify_agent_request(messages, _last_user) + _low_signal_turn = bool(_intent.get("low_signal")) + _casual_low_signal_turn = _is_casual_low_signal(_last_user) + _direct_low_signal = ( + _low_signal_turn + and not bool(_intent.get("continuation")) + and not plan_mode + and not approved_plan + and not guide_only + and (_casual_low_signal_turn or active_document is None) + and (_casual_low_signal_turn or not active_email) + and (_casual_low_signal_turn or not workspace) + and not forced_tools + and not relevant_tools + ) # Tool retrieval uses the latest message by default. It may inherit recent # user turns only for explicit continuations ("yes", "do it", "1"). _retrieval_query = str(_intent.get("retrieval_query") or _last_user) @@ -1897,11 +2014,86 @@ async def stream_agent_loop( "[agent-intent] latest=%r continuation=%s low_signal=%s domains=%s retrieval_query=%r", _last_user[:120], bool(_intent.get("continuation")), - bool(_intent.get("low_signal")), + _low_signal_turn, sorted(_intent.get("domains") or []), _retrieval_query[:200], ) _mcp_disabled_map = _load_mcp_disabled_map() if mcp_mgr else {} + if _direct_low_signal: + logger.info("[agent] direct low-signal reply path for latest=%r", _last_user[:80]) + direct_messages = [{"role": "user", "content": _last_user}] + direct_response = "" + direct_start = time.time() + direct_actual_model = model + real_input_tokens = 0 + real_output_tokens = 0 + try: + async for chunk in stream_llm_with_fallback( + [(endpoint_url, model, headers)] + list(fallbacks or []), + direct_messages, + temperature=temperature, + max_tokens=min(max_tokens or 128, 128), + prompt_type=None, + tools=None, + timeout=int(get_setting("agent_stream_timeout_seconds", 300) or 300), + session_id=session_id, + ): + if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): + try: + 
data = json.loads(chunk[6:]) + except json.JSONDecodeError: + yield chunk + continue + if data.get("type") == "usage": + usage = data.get("data", {}) or {} + direct_actual_model = usage.get("model") or direct_actual_model + real_input_tokens += usage.get("input_tokens", 0) or 0 + real_output_tokens += usage.get("output_tokens", 0) or 0 + continue + if data.get("type") == "model_actual": + direct_actual_model = data.get("model") or direct_actual_model + data["requested_model"] = model + yield f"data: {json.dumps(data)}\n\n" + continue + if data.get("type") == "fallback": + direct_actual_model = data.get("answered_by") or direct_actual_model + yield chunk + continue + if "delta" in data: + if not data.get("thinking"): + direct_response += data.get("delta", "") + yield chunk + continue + yield chunk + elif chunk.startswith("event: "): + yield chunk + except Exception as _direct_err: + logger.warning("[agent] direct low-signal path failed: %s", _direct_err) + fallback = "Hey." + direct_response += fallback + yield f"data: {json.dumps({'delta': fallback})}\n\n" + + if not direct_response.strip(): + fallback = "Hey." + direct_response = fallback + yield f"data: {json.dumps({'delta': fallback})}\n\n" + + duration = time.time() - direct_start + metrics = { + "model": direct_actual_model, + "requested_model": model, + "input_tokens": real_input_tokens or estimate_tokens(direct_messages), + "output_tokens": real_output_tokens or max(len(direct_response) // 4, 1), + "total_time": round(duration, 2), + "response_time": round(duration, 2), + "agent_rounds": 0, + "tool_calls": 0, + "direct_low_signal": True, + } + yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n" + yield "data: [DONE]\n\n" + return + if plan_mode and mcp_mgr: # Allow read-only MCP tools to investigate, block write/unknown ones: # hide them from the schemas AND reject them at runtime by qualified name. 
@@ -1913,11 +2105,11 @@ async def stream_agent_loop( # RAG-based tool selection: retrieve relevant tools for this query. # If caller provided a pre-computed set (e.g. task_scheduler), use that. - _relevant_tools = set() if guide_only else relevant_tools + _relevant_tools = relevant_tools _t1 = time.time() if _relevant_tools: logger.info(f"[tool-rag] Using caller-provided relevant_tools ({len(_relevant_tools)} tools)") - if not guide_only and not _relevant_tools and bool(_intent.get("low_signal")): + if not guide_only and not _relevant_tools and _low_signal_turn: from src.tool_index import ALWAYS_AVAILABLE if workspace: # An active workspace IS the file-work signal: a vague "look at the @@ -2008,6 +2200,15 @@ async def stream_agent_loop( if _relevant_tools is not None and active_document is not None: _relevant_tools.update({"edit_document", "update_document", "suggest_document"}) + # Per-request UI toggles are stronger than retrieval. If the user turns on + # Search, the model must see the search tools even when the latest text is a + # typo or otherwise low-signal for tool RAG. + if not guide_only and forced_tools: + if _relevant_tools is None: + from src.tool_index import ALWAYS_AVAILABLE + _relevant_tools = set(ALWAYS_AVAILABLE) + _relevant_tools.update(t for t in forced_tools if t not in disabled_tools) + # The skill index injected by _build_system_prompt tells the model to # call `manage_skills action=view`, and Jaccard-matched skills are pasted # into the prompt as procedures to follow — but neither path goes through @@ -2015,7 +2216,7 @@ async def stream_agent_loop( # (grep, read_file, ...) that aren't in its schema list. Keep the schemas # in lockstep: manage_skills is callable whenever any skill is indexed, # and a matched skill's declared requires_toolsets ride along with it. 
- if not guide_only and _relevant_tools is not None: + if not guide_only and _relevant_tools is not None and not _low_signal_turn: try: from services.memory.skills import SkillsManager from src.constants import DATA_DIR @@ -2080,7 +2281,7 @@ async def stream_agent_loop( _model_supports_tools = any(kw in _model_lc for kw in ( "gpt-4", "gpt-5", "gpt-o", "claude", "gemini", "gemma", "qwen3", "qwen2.5", "mixtral", "mistral", "llama-3.1", "llama-3.2", - "llama-3.3", "llama-4", + "llama-3.3", "llama-4", "llama3.1", "llama3.2", "llama3.3", "llama4", # Local-served models that follow OpenAI-style function calling # via vLLM's `--enable-auto-tool-choice`. Belt-and-suspenders # with the per-endpoint flag above. @@ -2122,13 +2323,15 @@ async def stream_agent_loop( _is_api_model = False else: _is_api_model = any(h in endpoint_url for h in _API_HOSTS) or _model_supports_tools + _compact_agent_prompt = _is_api_model or _is_ollama_native or _ollama_openai_compat messages, mcp_schemas = _build_system_prompt( messages, model, active_document, mcp_mgr, disabled_tools, needs_admin=_needs_admin, relevant_tools=_relevant_tools, mcp_disabled_map=_mcp_disabled_map, - compact=_is_api_model, + compact=_compact_agent_prompt, owner=owner, suppress_local_context=guide_only, + suppress_skills=_low_signal_turn, active_email=active_email, ) if plan_mode and not guide_only: @@ -2214,6 +2417,14 @@ async def stream_agent_loop( # Strip internal metadata keys before sending to the LLM API messages = [{k: v for k, v in msg.items() if k != "_protected"} for msg in messages] + agent_prompt_tokens = estimate_tokens(messages) + logger.info( + "[agent-timing] prep_done model=%s prompt_tokens=%s context_length=%s prep=%s", + model, + agent_prompt_tokens, + context_length, + {k: round(v, 3) for k, v in prep_timings.items()}, + ) yield f"data: {json.dumps({'type': 'agent_prep', 'data': {k: round(v, 3) for k, v in prep_timings.items()}})}\n\n" full_response = "" @@ -2358,6 +2569,19 @@ async def 
stream_agent_loop( # complementary cap for the rare stream that trickles bytes forever and # so never trips the inactivity timeout. Generous — only catches runaway. _round_deadline = time.time() + max(agent_stream_timeout * 4, 1200) + _round_start = time.time() + _round_first_event_logged = False + _round_first_token_logged = False + logger.info( + "[agent-timing] round_start round=%s model=%s endpoint=%s prompt_tokens=%s tools=%s native_tools=%s timeout=%s", + round_num, + model, + endpoint_url, + estimate_tokens(messages), + len(_tool_names_sent), + bool(all_tool_schemas), + agent_stream_timeout, + ) async for chunk in stream_llm_with_fallback( _candidates, messages, @@ -2368,11 +2592,30 @@ async def stream_agent_loop( timeout=agent_stream_timeout, session_id=session_id, ): + if not _round_first_event_logged: + _round_first_event_logged = True + logger.info( + "[agent-timing] first_event round=%s elapsed=%.3fs kind=%s", + round_num, + time.time() - _round_start, + "error" if chunk.startswith("event: error") else "data", + ) if time.time() > _round_deadline: - logger.warning(f"[agent] round {round_num} stream exceeded wall-clock deadline; cutting off") + logger.warning( + "[agent-timing] round_deadline round=%s elapsed=%.3fs deadline_s=%s", + round_num, + time.time() - _round_start, + max(agent_stream_timeout * 4, 1200), + ) break # Forward error events from stream_llm to the frontend if chunk.startswith("event: error"): + logger.warning( + "[agent-timing] stream_error round=%s elapsed=%.3fs chunk=%r", + round_num, + time.time() - _round_start, + chunk[:500], + ) yield chunk continue if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): @@ -2452,6 +2695,15 @@ async def stream_agent_loop( if not first_token_received: time_to_first_token = time.time() - total_start first_token_received = True + if not _round_first_token_logged: + _round_first_token_logged = True + logger.info( + "[agent-timing] first_visible_token round=%s elapsed=%.3fs 
total_elapsed=%.3fs thinking=%s", + round_num, + time.time() - _round_start, + time.time() - total_start, + bool(data.get("thinking")), + ) # Keep reasoning deltas in a separate accumulator so # we can echo them back via `reasoning_content` on the # next request (DeepSeek requires this; harmless for @@ -2521,7 +2773,21 @@ async def stream_agent_loop( yield chunk # Intercept [DONE] — don't forward until all rounds finish - tool_blocks, used_native = _resolve_tool_blocks(round_response, native_tool_calls, round_num, is_api_model=_is_api_model) + logger.info( + "[agent-timing] round_stream_done round=%s elapsed=%.3fs text_chars=%s tool_calls=%s first_event=%s first_token=%s", + round_num, + time.time() - _round_start, + len(round_response), + len(native_tool_calls), + _round_first_event_logged, + _round_first_token_logged, + ) + tool_blocks, used_native = _resolve_tool_blocks( + round_response, + native_tool_calls, + round_num, + is_api_model=(_is_api_model and not guide_only), + ) # Force-answer round: we told the model to STOP calling tools and # answer. If it ignored that and emitted a (possibly DSML) tool @@ -2605,7 +2871,7 @@ async def stream_agent_loop( # model with no real native_tool_calls) must not be stripped from the # persisted text either — otherwise it streams once and then disappears # on reload (#3222 follow-up). 
- cleaned_round = strip_tool_blocks(round_response, skip_fenced=(_is_api_model and not used_native)).strip() + cleaned_round = strip_tool_blocks(round_response, skip_fenced=(_is_api_model and not used_native and not guide_only)).strip() round_texts.append(cleaned_round) if not tool_blocks: @@ -2677,6 +2943,15 @@ async def stream_agent_loop( _intent_nudge_count += 1 _matched_phrase = _intent_match.group(0).strip() logger.info(f"[agent] intent-without-action nudge #{_intent_nudge_count} on round {round_num}: {_matched_phrase!r}") + _lower_phrase = _matched_phrase.lower() + _cookbook_log_hint = "" + if any(_word in _lower_phrase for _word in ("log", "logs", "output", "tail", "status")): + _cookbook_log_hint = ( + " If this is about a Cookbook/model serve, the concrete calls are: " + "`list_served_models` first, then `tail_serve_output` with the " + "session_id from the serve/list result. Never answer with " + "\"check logs\" when those tools are available." + ) messages.append({ "role": "system", "content": ( @@ -2685,6 +2960,7 @@ async def stream_agent_loop( "see you announced the action but didn't run it, which " "is the most frustrating thing you can do. " "DO IT NOW: emit the actual function call this turn. " + f"{_cookbook_log_hint}" "If you decided not to do it after all, say so plainly in " "one sentence instead of restating the plan." ), diff --git a/src/agent_runs.py b/src/agent_runs.py index 8adbab9c9..3431347c7 100644 --- a/src/agent_runs.py +++ b/src/agent_runs.py @@ -174,8 +174,20 @@ async def subscribe(session_id: str) -> AsyncGenerator[str, None]: next_seq += 1 if run.status != "running": return + heartbeat_idx = 0 while True: - seq, ev = await q.get() + try: + seq, ev = await asyncio.wait_for(q.get(), timeout=10.0) + except asyncio.TimeoutError: + # Keep slow local models/proxies alive while they prefill before + # the first token. 
SSE comments are ignored by the UI but reset + # browser/proxy idle timers, which prevents "empty response" + # disconnects on llama.cpp first-token latencies of 30s+. + if run.status == "running": + heartbeat_idx += 1 + yield f": heartbeat {heartbeat_idx}\n\n" + continue + seq, ev = (None, None) if seq is None: # end sentinel while next_seq < len(run.buffer): # flush any tail the sentinel raced yield run.buffer[next_seq] diff --git a/src/agent_tools/web_tools.py b/src/agent_tools/web_tools.py index 9c1d2ca97..19ece8cd8 100644 --- a/src/agent_tools/web_tools.py +++ b/src/agent_tools/web_tools.py @@ -7,6 +7,7 @@ from src.constants import MAX_OUTPUT_CHARS class WebSearchTool: async def execute(self, content: str, ctx: dict) -> dict: from src.search import comprehensive_web_search + progress_cb = ctx.get("progress_cb") if isinstance(ctx, dict) else None raw = content.strip() query = raw time_filter = None @@ -37,18 +38,39 @@ class WebSearchTool: elif " news" in q_lc or q_lc.startswith("news ") or q_lc.endswith(" news"): time_filter = "week" loop = asyncio.get_running_loop() - text, sources = await asyncio.wait_for( - loop.run_in_executor( - None, - lambda: comprehensive_web_search( - query, - max_pages=max_pages, - time_filter=time_filter, - return_sources=True, + if progress_cb: + await progress_cb({ + "elapsed_s": 0, + "tail": f"Searching web for: {query[:160]}", + }) + try: + text, sources = await asyncio.wait_for( + loop.run_in_executor( + None, + lambda: comprehensive_web_search( + query, + max_pages=max_pages, + time_filter=time_filter, + return_sources=True, + ), ), - ), - timeout=30, - ) + timeout=30, + ) + except asyncio.TimeoutError: + return { + "error": f"web_search timed out after 30s: {query[:200]}", + "exit_code": 1, + } + except Exception as e: + return { + "error": f"web_search failed: {type(e).__name__}: {str(e) or 'no details'}", + "exit_code": 1, + } + if progress_cb: + await progress_cb({ + "elapsed_s": 30, + "tail": "Search completed; preparing 
sources.", + }) output = text[:MAX_OUTPUT_CHARS] if len(text) > MAX_OUTPUT_CHARS else text if sources: output += "\n\n" diff --git a/src/builtin_actions.py b/src/builtin_actions.py index a598cb652..bf4ddd950 100644 --- a/src/builtin_actions.py +++ b/src/builtin_actions.py @@ -76,8 +76,7 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]: import json import re from src.constants import DATA_DIR - from src.endpoint_resolver import resolve_endpoint - from src.llm_core import llm_call_async + from src.llm_core import llm_call_async_with_fallback from src.memory import MemoryManager manager = MemoryManager(DATA_DIR) @@ -116,10 +115,9 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]: if len(group_memories) < 2: return False - url, model, headers = resolve_endpoint("utility", owner=group_owner or None) - if not url or not model: - url, model, headers = resolve_endpoint("default", owner=group_owner or None) - if not url or not model: + from src.task_endpoint import resolve_task_candidates + candidates = resolve_task_candidates(owner=group_owner or None) + if not candidates: return False try: @@ -147,13 +145,11 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]: "\"drop\":[{\"id\":\"existing id\",\"reason\":\"short reason\"}]}\n\n" f"MEMORIES:\n{json.dumps(items, ensure_ascii=False)}" ) - raw = await llm_call_async( - url=url, - model=model, + raw = await llm_call_async_with_fallback( + candidates, messages=[{"role": "user", "content": prompt}], temperature=0.0, max_tokens=4096, - headers=headers, timeout=120, ) from src.text_helpers import strip_think @@ -604,8 +600,7 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]: try: from datetime import timedelta from core.database import SessionLocal, CalendarEvent - from src.endpoint_resolver import resolve_endpoint - from src.llm_core import llm_call_async + from src.llm_core import llm_call_async_with_fallback 
import re as _re, json as _json db = SessionLocal() @@ -620,10 +615,9 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]: if not events: return "No upcoming events to classify", True - llm_url, llm_model, llm_headers = resolve_endpoint("utility", owner=owner) - if not llm_url: - llm_url, llm_model, llm_headers = resolve_endpoint("default", owner=owner) - llm_available = bool(llm_url and llm_model) + from src.task_endpoint import resolve_task_candidates + llm_candidates = resolve_task_candidates(owner=owner) + llm_available = bool(llm_candidates) # Pull user memories so the LLM has personal context (relationships, # job, hobbies). Helps it know e.g. " is your spouse" so their @@ -699,11 +693,11 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]: f"EVENTS: {_json.dumps(items)}" ) try: - raw = await llm_call_async( - url=llm_url, model=llm_model, + raw = await llm_call_async_with_fallback( + llm_candidates, messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=16384, - headers=llm_headers, timeout=180, + timeout=180, ) from src.text_helpers import strip_think as _st raw = _st(raw or "", prose=False, prompt_echo=False) @@ -810,8 +804,7 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo import asyncio as _aio from datetime import datetime as _dt, timedelta as _td from routes.email_helpers import _email_cache_owner_clause, _imap_connect, SCHEDULED_DB - from src.endpoint_resolver import resolve_endpoint - from src.llm_core import llm_call_async + from src.llm_core import llm_call_async_with_fallback # 1. Pull recent UIDs + From headers cheaply (header-only fetch). 
def _pull_headers(): @@ -891,11 +884,11 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo if not eligible: return "All sender sigs already cached (or no eligible senders)", True - url, model, headers = resolve_endpoint("utility", owner=owner) - if not url or not model: - url, model, headers = resolve_endpoint("default", owner=owner) - if not url or not model: + from src.task_endpoint import resolve_task_candidates + candidates = resolve_task_candidates(owner=owner) + if not candidates: return "No LLM endpoint available", False + model = candidates[0][1] analyzed = 0 no_sig = 0 @@ -949,11 +942,11 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo ) try: - raw = await llm_call_async( - url=url, model=model, + raw = await llm_call_async_with_fallback( + candidates, messages=[{"role": "user", "content": prompt}], temperature=0.0, max_tokens=600, - headers=headers, timeout=60, + timeout=60, ) from src.text_helpers import strip_think as _st sig = _st(raw or "", prose=False, prompt_echo=False).strip() @@ -1137,7 +1130,6 @@ async def action_test_skills(owner: str, **kwargs) -> Tuple[str, bool]: from services.memory.skills import SkillsManager from src.constants import DATA_DIR from routes.skills_routes import _run_skill_test_once, _skill_test_task - from src.endpoint_resolver import resolve_endpoint # #3 SCOPE GUARD: refuse to run on a None/empty owner — otherwise # `sm.load(owner=None)` returns every user's skills and we'd cross- @@ -1152,27 +1144,40 @@ async def action_test_skills(owner: str, **kwargs) -> Tuple[str, bool]: if not names: raise TaskNoop("no skills to test") - url, model, headers = resolve_endpoint("default", owner=owner) - if not url or not model: + from src.task_endpoint import resolve_task_candidates + candidates = resolve_task_candidates(owner=owner) + if not candidates: return "No Default/Utility model configured — set one in Settings.", False # #2 NO SILENT MODEL SWAP: if the 
configured model isn't served by the # endpoint, try a basename match — but fail loudly instead of grabbing # `avail[0]` which could be an embedding-only model and produce 36 # garbage transcripts → 36 'unknown' verdicts with no hint why. + url, model, headers = candidates[0] try: from src.llm_core import list_model_ids - avail = list_model_ids(url, headers=headers) - if avail and model not in avail: - import os as _os - base = _os.path.basename((model or "").rstrip("/")) - m = next((a for a in avail if _os.path.basename(a.rstrip("/")) == base), None) - if m: - model = m - else: - return (f"Default model '{model}' not served by endpoint {url}. " - f"Available: {', '.join(avail[:8])}{'…' if len(avail) > 8 else ''}. " - "Set a valid Default model in Settings."), False + import os as _os + + selected = None + mismatch_notes = [] + for cand_url, cand_model, cand_headers in candidates: + avail = list_model_ids(cand_url, headers=cand_headers) + if not avail or cand_model in avail: + selected = (cand_url, cand_model, cand_headers) + break + base = _os.path.basename((cand_model or "").rstrip("/")) + matched = next((a for a in avail if _os.path.basename(a.rstrip("/")) == base), None) + if matched: + selected = (cand_url, matched, cand_headers) + break + mismatch_notes.append( + f"{cand_model} not served by {cand_url}; available: " + f"{', '.join(avail[:8])}{'...' if len(avail) > 8 else ''}" + ) + if selected: + url, model, headers = selected + elif mismatch_notes: + return "No configured task fallback model is served. 
" + " | ".join(mismatch_notes[:3]), False except Exception as _e: logger.warning(f"test_skills model resolve check failed (continuing): {_e}") @@ -1483,7 +1488,6 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]: from pathlib import Path as _P from core.database import SessionLocal as _SL, EmailAccount as _EA from routes.email_helpers import _imap_connect, _decode_header - from src.endpoint_resolver import resolve_endpoint, resolve_utility_fallback_candidates from src.llm_core import llm_call_async_with_fallback # Per-owner state file so multi-user runs don't clobber each other's @@ -1505,12 +1509,10 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]: # ── 1. Resolve LLM candidates (utility primary + utility fallbacks; fall # through to default chat as a last resort). - url, model, headers = resolve_endpoint("utility", owner=owner) - if not url or not model: - url, model, headers = resolve_endpoint("default", owner=owner) - if not url or not model: + from src.task_endpoint import resolve_task_candidates + candidates = resolve_task_candidates(owner=owner) + if not candidates: return "No LLM endpoint available", False - candidates = [(url, model, headers)] + resolve_utility_fallback_candidates(owner=owner) # ── 2. Enumerate enabled accounts. 
Match this task's owner AND fall # back to the legacy "unowned account whose imap_user / from_address diff --git a/src/constants.py b/src/constants.py index b76f5d97b..eceeb6eb0 100644 --- a/src/constants.py +++ b/src/constants.py @@ -4,7 +4,7 @@ import os from src.runtime_paths import get_app_root, get_default_data_dir -APP_VERSION = "1.0.0" +APP_VERSION = "1.0.1" # Base paths BASE_DIR = os.path.join(get_app_root(), "") diff --git a/src/endpoint_resolver.py b/src/endpoint_resolver.py index 83ba1ce92..57361c673 100644 --- a/src/endpoint_resolver.py +++ b/src/endpoint_resolver.py @@ -424,6 +424,9 @@ def resolve_utility_fallback_candidates(owner: Optional[str] = None) -> list: settings = load_settings() utility_ep = (get_user_setting("utility_endpoint_id", owner or "", settings.get("utility_endpoint_id", "")) or "").strip() if not utility_ep: + utility_chain = get_user_setting("utility_model_fallbacks", owner or "", settings.get("utility_model_fallbacks") or []) or [] + if utility_chain: + return _resolve_fallback_candidates("utility_model_fallbacks", owner=owner) return _resolve_fallback_candidates("default_model_fallbacks", owner=owner) except Exception: pass diff --git a/src/llm_core.py b/src/llm_core.py index e1f732b9a..20a4b544c 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -907,7 +907,10 @@ def _anthropic_rejects_temperature(model: str) -> bool: return (int(match.group(1)), int(match.group(2))) >= (4, 7) # Models that support structured thinking — may output without opening tag -_THINKING_MODEL_PATTERNS = ("qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", "m2-reap", "gemma") +_THINKING_MODEL_PATTERNS = ( + "qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", + "m2-reap", "gemma", "stepfun", "step-3", "step3", +) def _supports_thinking(model: str) -> bool: """Check if model supports structured thinking output.""" @@ -2135,6 +2138,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl yield 
_stream_delta_event(reasoning, thinking=True) content = delta.get("content") or "" if content: + content = re.sub(r"]*)?>", r"", content, flags=re.IGNORECASE) + content = re.sub(r"", "", content, flags=re.IGNORECASE) stripped = content.lstrip() # gpt-oss harmony format (<|channel|>analysis/final): route via the harmony # stream router. Sticky once the first marker appears — distinct from the diff --git a/src/task_endpoint.py b/src/task_endpoint.py index 6e477a3ec..6f9a27c09 100644 --- a/src/task_endpoint.py +++ b/src/task_endpoint.py @@ -1,6 +1,11 @@ -"""Shared resolver for background-task AI endpoint (auto-naming, memory, sorting).""" +"""Shared resolver for background-task AI endpoints.""" -from src.endpoint_resolver import resolve_endpoint +from src.endpoint_resolver import ( + resolve_chat_fallback_candidates, + resolve_endpoint, + resolve_utility_fallback_candidates, +) +from src.llm_core import llm_call_async_with_fallback def resolve_task_endpoint(fallback_url=None, fallback_model=None, fallback_headers=None, owner=None): @@ -11,3 +16,60 @@ def resolve_task_endpoint(fallback_url=None, fallback_model=None, fallback_heade endpoint cannot be resolved. """ return resolve_endpoint("task", fallback_url, fallback_model, fallback_headers, owner=owner) + + +def resolve_task_candidates( + fallback_url=None, + fallback_model=None, + fallback_headers=None, + owner=None, +): + """Return ordered background-task LLM candidates. + + Order: + 1. configured Background Tasks endpoint/model, or caller fallback + 2. Utility endpoint/model + 3. Default endpoint/model + 4. Utility fallback chain + 5. 
Default fallback chain + """ + candidates = [] + + def _append(url, model, headers): + if not url or not model: + return + key = (url, model) + if any((u, m) == key for u, m, _ in candidates): + return + candidates.append((url, model, headers or {})) + + _append(*resolve_task_endpoint(fallback_url, fallback_model, fallback_headers, owner=owner)) + _append(*resolve_endpoint("utility", owner=owner)) + _append(*resolve_endpoint("default", owner=owner)) + for url, model, headers in resolve_utility_fallback_candidates(owner=owner): + _append(url, model, headers) + for url, model, headers in resolve_chat_fallback_candidates(owner=owner): + _append(url, model, headers) + + return candidates + + +async def task_llm_call_async( + messages, + *, + fallback_url=None, + fallback_model=None, + fallback_headers=None, + owner=None, + **kwargs, +): + """Call the shared background-task LLM candidate chain.""" + candidates = resolve_task_candidates( + fallback_url=fallback_url, + fallback_model=fallback_model, + fallback_headers=fallback_headers, + owner=owner, + ) + if not candidates: + raise RuntimeError("No LLM endpoint available for background task") + return await llm_call_async_with_fallback(candidates, messages=messages, **kwargs) diff --git a/src/task_scheduler.py b/src/task_scheduler.py index 2b33a8159..e5389df99 100644 --- a/src/task_scheduler.py +++ b/src/task_scheduler.py @@ -886,6 +886,14 @@ class TaskScheduler: owner=task.owner, body=run.result if output == "notification" else None, ) + elif run.status == "error": + self.add_notification( + task.name, + "error", + task_id, + owner=task.owner, + body=run.error or run.result, + ) # Log result to the assistant chat so all task activity is visible. 
# Skip skipped/error rows — user shouldn't see "skipped: …" noise @@ -1468,12 +1476,18 @@ class TaskScheduler: ) except Exception as e: logger.warning(f"Agent loop failed for task '{task.name}', falling back to simple call: {e}") - from src.llm_core import llm_call_async + from src.task_endpoint import task_llm_call_async messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": task.prompt}, ] - result = await llm_call_async(url=endpoint_url, model=model, messages=messages, timeout=120) + result = await task_llm_call_async( + messages, + fallback_url=endpoint_url, + fallback_model=model, + owner=task.owner, + timeout=120, + ) # Strip the model's chain-of-thought before saving/delivering. Task # output is LLM-only, so prose=True (which also removes untagged @@ -1698,13 +1712,17 @@ class TaskScheduler: # Honor per-task max_steps (defense against runaway agent loops). # Falls back to 20 if not set — the historical default. _task_max_rounds = task.max_steps if task.max_steps and task.max_steps > 0 else 20 - # Tasks are background workloads — they share the Utility model's - # fallback chain (Settings → Utility Model → Fallbacks). A downed - # primary endpoint won't silently yield `(no output)` — same recipe - # chat uses but with the utility list (`utility_model_fallbacks`). + # Tasks are background workloads: use the shared task fallback chain + # behind the primary endpoint so a downed primary won't silently yield + # `(no output)`. 
try: - from src.endpoint_resolver import resolve_utility_fallback_candidates - _task_fallbacks = resolve_utility_fallback_candidates(owner=task.owner or None) + from src.task_endpoint import resolve_task_candidates + _task_fallbacks = resolve_task_candidates( + fallback_url=endpoint_url, + fallback_model=model, + fallback_headers=headers, + owner=task.owner or None, + )[1:] except Exception: _task_fallbacks = [] async for event_str in stream_agent_loop( @@ -1741,21 +1759,22 @@ class TaskScheduler: # asking it to summarize what it did. Guarantees output. if not full_text.strip(): try: - from src.llm_core import llm_call_async_with_fallback - from src.endpoint_resolver import resolve_utility_fallback_candidates + from src.task_endpoint import task_llm_call_async grace_context = "You ran out of steps. " if tool_results: grace_context += "Here's what your tools returned:\n" + "\n".join(tool_results[-5:]) else: grace_context += "No tool results were captured." grace_context += "\n\nSummarize what you accomplished and what's still pending. Be concise." 
- _grace_candidates = [(endpoint_url, model, headers)] + resolve_utility_fallback_candidates(owner=task.owner or None) - full_text = await llm_call_async_with_fallback( - _grace_candidates, + full_text = await task_llm_call_async( messages=[ {"role": "system", "content": system_content}, {"role": "user", "content": grace_context}, ], + fallback_url=endpoint_url, + fallback_model=model, + fallback_headers=headers, + owner=task.owner or None, timeout=30, ) full_text = (full_text or "").strip() diff --git a/src/tool_implementations.py b/src/tool_implementations.py index a16e93e63..3ce3ee613 100644 --- a/src/tool_implementations.py +++ b/src/tool_implementations.py @@ -1268,8 +1268,8 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict: _ALIASES = { "shell": ["bash"], "terminal": ["bash"], - "search": ["web_search"], - "web": ["web_search"], + "search": ["web_search", "web_fetch"], + "web": ["web_search", "web_fetch"], "browser": ["builtin_browser"], "documents": ["create_document", "edit_document", "update_document", "suggest_document"], "doc": ["create_document", "edit_document", "update_document", "suggest_document"], @@ -1281,7 +1281,7 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict: "notes": ["manage_notes"], "calendar": ["manage_calendar"], "email": ["mcp__email__list_emails", "mcp__email__read_email", "mcp__email__send_email"], - "research": ["web_search"], # research is a per-request flag, not a tool — closest analog + "research": ["web_search", "web_fetch"], # research is a per-request flag, not a tool — closest analog } if action == "list_tools": @@ -2863,13 +2863,25 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict: endpoint_added=endpoint_added, endpoint_id=endpoint_id or "", ) note = "" if registered else " (state-write failed — task may not show in UI)" + where = host or "local" + log_path = f"/tmp/odysseus-tmux/{sid}.log" return { - "output": f"Serving {repo_id} 
(session: {sid}){note}", + "output": ( + f"Serving {repo_id} on {where} (session: {sid}){note}\n" + f"Next required check: call list_served_models. If this task is not ready, " + f"call tail_serve_output with session_id={sid} and tail=400 before answering. " + f"Do not tell the user to check logs; you have the log tool." + ), "session_id": sid, "task_type": "serve", "phase": "running", "host": host, "endpoint_id": endpoint_id, + "log_path": log_path, + "next_tools": [ + {"name": "list_served_models", "arguments": {}}, + {"name": "tail_serve_output", "arguments": {"session_id": sid, "tail": 400}}, + ], "exit_code": 0, } # FastAPI HTTPException puts the message under `detail`, not `error`. @@ -3216,8 +3228,17 @@ async def do_tail_serve_output(content: str, owner: Optional[str] = None) -> Dic MAX_CHARS = 8000 if len(output_text) > MAX_CHARS: output_text = "…(earlier output truncated)…\n" + output_text[-MAX_CHARS:] + if not output_text: + output_text = ( + f"No log output captured yet for {session_id} on {host_label}. " + "This usually means the tmux wrapper has started but the model process " + "has not printed anything yet. Do not stop here: call list_served_models " + "again to check whether it is still loading, ready, or crashed; if it is " + "still not ready, call tail_serve_output again with a larger tail after " + "the next status check." 
+ ) return { - "output": output_text or "(empty pane)", + "output": output_text, "session_id": session_id, "host": host_label, "tail_lines": tail, diff --git a/src/tool_parsing.py b/src/tool_parsing.py index c9548cce9..4b19d9236 100644 --- a/src/tool_parsing.py +++ b/src/tool_parsing.py @@ -39,6 +39,10 @@ _XML_TOOL_CALL_RE = re.compile( r"<(?:[\w]+:)?(?:tool_call|function_call)>\s*([\s\S]*?)", re.IGNORECASE, ) +_XML_OPEN_TOOL_CALL_RE = re.compile( + r"<(?:[\w]+:)?(?:tool_call|function_call)>\s*([\s\S]*)\Z", + re.IGNORECASE, +) _XML_INVOKE_RE = re.compile( r'\s*([\s\S]*?)', re.IGNORECASE, @@ -47,6 +51,21 @@ _XML_PARAM_RE = re.compile( r'([\s\S]*?)', re.IGNORECASE, ) +_XML_DIRECT_TOOL_RE = re.compile( + r"<\s*([A-Za-z_][\w-]*)\s*>([\s\S]*?)", + re.IGNORECASE, +) + +# Pattern 3b: StepFun Step-3.x native tool-call tokens. The tokenizer defines: +# <|tool▁calls▁begin|> ... <|tool▁calls▁end|> +# <|tool▁call▁begin|>tool_name<|tool▁sep|>{...}<|tool▁call▁end|> +# These can leak as text through llama.cpp/Ollama-style endpoints when the +# engine does not return structured OpenAI tool_calls. +_STEPFUN_CALL_BEGIN = "<|tool▁call▁begin|>" +_STEPFUN_CALL_SEP = "<|tool▁sep|>" +_STEPFUN_CALL_END = "<|tool▁call▁end|>" +_STEPFUN_CALLS_BEGIN = "<|tool▁calls▁begin|>" +_STEPFUN_CALLS_END = "<|tool▁calls▁end|>" # Pattern 4: blocks (MiniMax-M2.5 style) # {tool => 'tool_name', args => 'value'} @@ -446,6 +465,138 @@ def _parse_xml_invoke(inv_match) -> Optional[ToolBlock]: return function_call_to_tool_block(tool_name, json.dumps(params)) +def _parse_xml_direct_tool(tool_match) -> Optional[ToolBlock]: + """Parse direct XML tool tags inside . + + Some local models emit: + query + instead of the invoke/parameter shape: + query + Keep this as an adapter to the canonical function-call converter so aliases + and per-tool argument formatting stay in one place. 
+ """ + tool_name = tool_match.group(1).lower().replace("-", "_") + if tool_name in {"invoke", "parameter", "tool_call", "function_call"}: + return None + mapped = _TOOL_NAME_MAP.get(tool_name) or (tool_name if tool_name in TOOL_TAGS else None) + if not mapped: + return None + body = tool_match.group(2).strip() + if not body: + return None + try: + params = json.loads(body) + if not isinstance(params, dict): + params = {} + except json.JSONDecodeError: + if mapped == "web_search": + params = {"query": body} + elif mapped == "web_fetch": + params = {"url": body} + elif mapped == "bash": + params = {"command": body} + elif mapped == "python": + params = {"code": body} + elif mapped in ("read_file", "write_file"): + params = {"path": body} + else: + params = {"content": body} + from src.tool_schemas import function_call_to_tool_block + return function_call_to_tool_block(mapped, json.dumps(params)) + + +def _iter_stepfun_tool_calls(text: str): + """Yield StepFun native tool-call token bodies without regex backtracking.""" + pos = 0 + while True: + start = text.find(_STEPFUN_CALL_BEGIN, pos) + if start < 0: + return + name_start = start + len(_STEPFUN_CALL_BEGIN) + sep = text.find(_STEPFUN_CALL_SEP, name_start) + if sep < 0: + return + end = text.find(_STEPFUN_CALL_END, sep + len(_STEPFUN_CALL_SEP)) + if end < 0: + return + raw_name = text[name_start:sep].strip() + body = text[sep + len(_STEPFUN_CALL_SEP):end].strip() + if raw_name and len(raw_name) <= 128: + yield raw_name, body + pos = end + len(_STEPFUN_CALL_END) + + +def _strip_stepfun_tool_markup(text: str) -> str: + """Remove StepFun tool-call token blocks and wrappers using literal scans.""" + out = [] + pos = 0 + while True: + start = text.find(_STEPFUN_CALL_BEGIN, pos) + if start < 0: + out.append(text[pos:]) + break + end = text.find(_STEPFUN_CALL_END, start + len(_STEPFUN_CALL_BEGIN)) + if end < 0: + out.append(text[pos:]) + break + out.append(text[pos:start]) + pos = end + len(_STEPFUN_CALL_END) + cleaned = 
"".join(out) + return cleaned.replace(_STEPFUN_CALLS_BEGIN, "").replace(_STEPFUN_CALLS_END, "") + + +def _strip_bare_invoke_markup(text: str) -> str: + """Remove bare ... blocks without regex backtracking.""" + out = [] + pos = 0 + while True: + start = text.lower().find("", start) + if tag_end < 0: + out.append(text[pos:]) + break + close = text.lower().find("", tag_end + 1) + if close < 0: + out.append(text[pos:]) + break + out.append(text[pos:start]) + pos = close + len("") + return "".join(out) + + +def _parse_stepfun_tool_call(tool_name: str, body: str) -> Optional[ToolBlock]: + """Parse StepFun native tool-call tokens into an Odysseus ToolBlock.""" + tool_name = tool_name.lower().replace("-", "_").replace(".", "_") + mapped = _TOOL_NAME_MAP.get(tool_name) or (tool_name if tool_name in TOOL_TAGS else None) + if not mapped: + return None + body = (body or "").strip() + if not body: + return None + try: + params = json.loads(body) + if not isinstance(params, dict): + params = {} + except json.JSONDecodeError: + if mapped == "web_search": + params = {"query": body} + elif mapped == "web_fetch": + params = {"url": body} + elif mapped == "bash": + params = {"command": body} + elif mapped == "python": + params = {"code": body} + elif mapped in ("read_file", "write_file"): + params = {"path": body} + else: + params = {"content": body} + from src.tool_schemas import function_call_to_tool_block + return function_call_to_tool_block(mapped, json.dumps(params)) + + def _parse_tool_code_block(raw: str) -> Optional[ToolBlock]: """Parse a {tool => 'name', args => '...'} block (MiniMax style).""" # Extract tool name @@ -511,8 +662,9 @@ def parse_tool_blocks(text: str, skip_fenced: bool = False) -> List[ToolBlock]: 2. [TOOL_CALL] ... [/TOOL_CALL] blocks (some models) 3. XML-style / blocks 4. blocks (MiniMax-M2.5 style) - 5. DeepSeek DSML markup (normalized to first) - 6. Non-native local model fallback: prose mentioning web_search followed by + 5. 
StepFun Step-3 native <|tool▁call▁begin|> tokens + 6. DeepSeek DSML markup (normalized to first) + 7. Non-native local model fallback: prose mentioning web_search followed by bare JSON args, e.g. {"query":"...", "time_filter":"week"} `skip_fenced`: when True, Pattern 1 (fenced ```bash/```python/```json code @@ -567,12 +719,38 @@ def parse_tool_blocks(text: str, skip_fenced: bool = False) -> List[ToolBlock]: # Pattern 3: XML-style / blocks if not blocks: + for tool_name, body in _iter_stepfun_tool_calls(text): + block = _parse_stepfun_tool_call(tool_name, body) + if block: + blocks.append(block) + if blocks: + return blocks # Try wrapped: ... for m in _XML_TOOL_CALL_RE.finditer(text): for inv in _XML_INVOKE_RE.finditer(m.group(1)): block = _parse_xml_invoke(inv) if block: blocks.append(block) + if not blocks: + for direct in _XML_DIRECT_TOOL_RE.finditer(m.group(1)): + block = _parse_xml_direct_tool(direct) + if block: + blocks.append(block) + # Some local models stream an opening wrapper and a + # complete inner tool tag, but forget the closing . 
+ if not blocks: + for m in _XML_OPEN_TOOL_CALL_RE.finditer(text): + body = m.group(1) + for inv in _XML_INVOKE_RE.finditer(body): + block = _parse_xml_invoke(inv) + if block: + blocks.append(block) + if blocks: + break + for direct in _XML_DIRECT_TOOL_RE.finditer(body): + block = _parse_xml_direct_tool(direct) + if block: + blocks.append(block) # Try bare without wrapper if not blocks: for inv in _XML_INVOKE_RE.finditer(text): @@ -614,7 +792,9 @@ def strip_tool_blocks(text: str, skip_fenced: bool = False) -> str: text = _normalize_dsml(text) cleaned = text if skip_fenced else _TOOL_BLOCK_RE.sub('', text) cleaned = _TOOL_CALL_RE.sub('', cleaned) + cleaned = _strip_stepfun_tool_markup(cleaned) cleaned = _XML_TOOL_CALL_RE.sub('', cleaned) + cleaned = _XML_OPEN_TOOL_CALL_RE.sub('', cleaned) cleaned = _TOOL_CODE_RE.sub('', cleaned) if not skip_fenced: raw_web_json = _parse_raw_web_json_lookup(cleaned) @@ -622,6 +802,6 @@ def strip_tool_blocks(text: str, skip_fenced: bool = False) -> str: _, (start, end) = raw_web_json cleaned = cleaned[:start] + cleaned[end:] # Strip bare blocks not wrapped in - cleaned = re.sub(r'', '', cleaned, flags=re.DOTALL | re.IGNORECASE) + cleaned = _strip_bare_invoke_markup(cleaned) cleaned = re.sub(r'\n{3,}', '\n\n', cleaned) return cleaned.strip() diff --git a/static/icons/ollama-mark-crop.png b/static/icons/ollama-mark-crop.png new file mode 100644 index 000000000..2554b38a1 Binary files /dev/null and b/static/icons/ollama-mark-crop.png differ diff --git a/static/icons/ollama-mark.png b/static/icons/ollama-mark.png new file mode 100644 index 000000000..8cd2cf1ed Binary files /dev/null and b/static/icons/ollama-mark.png differ diff --git a/static/icons/sglang-logo.png b/static/icons/sglang-logo.png new file mode 100644 index 000000000..c13afe571 Binary files /dev/null and b/static/icons/sglang-logo.png differ diff --git a/static/icons/sglang-mark.png b/static/icons/sglang-mark.png new file mode 100644 index 000000000..3e0fe3eda Binary files 
/dev/null and b/static/icons/sglang-mark.png differ diff --git a/static/index.html b/static/index.html index fe22d6b9e..b20b73428 100644 --- a/static/index.html +++ b/static/index.html @@ -879,7 +879,7 @@ Library
@@ -1005,7 +1005,12 @@ '; const firstGroup = body.querySelector('.cookbook-group'); if (firstGroup) body.insertBefore(group, firstGroup); @@ -1760,16 +1922,25 @@ export function _renderRunningTab() { } // Group tasks by server - const _serverName = (host) => { - if (!host) return 'Local'; - const srv = _serverByVal(_envState.remoteServerKey || host) - || _envState.servers.find(s => s.host === host); - return srv?.name || host; + const _taskServerKey = (task) => task?.remoteServerKey || task?.remoteHost || ''; + const _serverName = (keyOrTask) => { + if (keyOrTask && typeof keyOrTask === 'object') { + const task = keyOrTask; + if (task.remoteServerName) return task.remoteServerName; + const srv = task.remoteServerKey ? _serverByVal(task.remoteServerKey) : null; + if (srv?.name) return srv.name; + if (!task.remoteHost) return 'Local'; + return (_envState.servers.find(s => s.host === task.remoteHost)?.name) || task.remoteHost; + } + const key = keyOrTask || ''; + if (!key || key === 'local') return 'Local'; + const srv = _serverByVal(key); + return srv?.name || key; }; const serverGroups = {}; for (const t of tasks) { - const key = t.remoteHost || ''; - if (!serverGroups[key]) serverGroups[key] = { name: _serverName(key), serve: [], download: [] }; + const key = _taskServerKey(t); + if (!serverGroups[key]) serverGroups[key] = { name: _serverName(t), serve: [], download: [] }; serverGroups[key][t.type === 'serve' ? 
'serve' : 'download'].push(t); } @@ -1816,12 +1987,12 @@ export function _renderRunningTab() { e.stopPropagation(); // don't toggle the section collapse (was an inline onclick, blocked by CSP) const host = btn.dataset.clearServer; const allTasks = _loadTasks(); - const toRemove = allTasks.filter(t => (t.remoteHost || '') === host && _canClearTask(t)); + const toRemove = allTasks.filter(t => _taskServerKey(t) === host && _canClearTask(t)); // Bail with a clear message instead of silently doing nothing when // every task on this server is still running (nothing finished to // clear yet) — the previous behavior looked like the button was dead. if (!toRemove.length) { - const stillRunning = allTasks.filter(t => (t.remoteHost || '') === host && t.status === 'running').length; + const stillRunning = allTasks.filter(t => _taskServerKey(t) === host && t.status === 'running').length; const _msg = stillRunning ? `No finished tasks on ${_serverName(host)} — ${stillRunning} still running. Stop them first to clear.` : `No finished tasks on ${_serverName(host)}.`; @@ -1830,7 +2001,8 @@ export function _renderRunningTab() { return; } if (!await window.styledConfirm(`Clear ${toRemove.length} finished task${toRemove.length === 1 ? '' : 's'} on ${_serverName(host)}?`, { confirmText: 'Clear' })) return; - const remaining = allTasks.filter(t => (t.remoteHost || '') !== host || !_canClearTask(t)); + toRemove.forEach(t => _tombstoneTask(t.sessionId)); + const remaining = allTasks.filter(t => _taskServerKey(t) !== host || !_canClearTask(t)); _saveTasks(remaining); // Fade/slide each finished card out (same exit as the per-card clear) // instead of yanking them instantly. 
@@ -1864,7 +2036,7 @@ export function _renderRunningTab() { btn.addEventListener('click', async (e) => { e.stopPropagation(); // don't toggle the section collapse const host = btn.dataset.stopServer; - const running = _loadTasks().filter(t => (t.remoteHost || '') === host && t.status === 'running'); + const running = _loadTasks().filter(t => _taskServerKey(t) === host && t.status === 'running'); if (!running.length) { uiModule.showToast(`Nothing running on ${_serverName(host)}`); return; } if (!await window.styledConfirm(`Stop ${running.length} running task${running.length > 1 ? 's' : ''} on ${_serverName(host)}?`, { confirmText: 'Stop all' })) return; // Mark every task as user-stopped BEFORE firing the kills so that the @@ -1967,11 +2139,12 @@ export function _renderRunningTab() { const _bdg = _taskBadge(task); const _bdgTitle = (task._unreachable && task.status === 'running') ? ' title="Server not responding — it may have crashed"' : ''; + const displayName = _taskDisplayName(task); el.innerHTML = `
${esc((task.status === 'done' && task.type === 'download') ? 'finished' : task.type)} - ${modelLogo(task.name)}${esc(task.name)} - ${esc(_clearPillLabel(task))}clear + ${modelLogo(task.name)}${esc(displayName)} + ${_canLaunchDownloadedTask(task) ? '' : ''}${esc(_clearPillLabel(task))}clear ${esc(_bdg.text)} @@ -2043,19 +2216,11 @@ export function _renderRunningTab() { e.stopPropagation(); const repo = task.payload?.repo_id || task.name; if (!repo) { uiModule.showToast('No model info on this task'); return; } - // Point the active server at the one it downloaded to. - const _tHost = task.remoteHost || ''; - _envState.remoteHost = _tHost; - const _tSrv = _serverByVal(_envState.remoteServerKey || _tHost) - || _envState.servers.find(s => s.host === _tHost); - if (_tSrv) { _envState.env = _tSrv.env || 'none'; _envState.envPath = _tSrv.envPath || ''; _envState.platform = _tSrv.platform || ''; } - else if (!_tHost) { _envState.env = 'none'; _envState.envPath = ''; _envState.platform = ''; } - document.querySelectorAll('#hwfit-server-select, #hwfit-dl-server, #hwfit-cache-server, #hwfit-deps-server').forEach(sel => { - if (sel && sel.tagName === 'SELECT') sel.value = _tHost || 'local'; - }); + // Point the active server at the exact profile it downloaded to. + _selectTaskServer(task); try { const { openServePanelForRepo } = await import('./cookbookServe.js'); - await openServePanelForRepo(repo); + await openServePanelForRepo(repo, _downloadServeFields(task)); // Serving it supersedes the finished download — clear the card from // the Running tab (smooth exit) now that we've jumped to Serve. 
_animateOutThenRemove(el, task.sessionId); @@ -2177,9 +2342,6 @@ export function _renderRunningTab() { if (task.status !== 'running' && task.status !== 'queued') { items.push({ group: 'run', label: 'Reconnect tmux', action: 'reconnect' }); } - if (task.status === 'running') { - items.push({ group: 'run', label: 'Stop', action: 'stop', danger: true }); - } items.push({ group: 'run', label: 'Restart', action: 'retry' }); // ── Edit section ──────────────────────────────────────────── // Merged "Edit & relaunch" — opens the structured serve panel @@ -2539,7 +2701,7 @@ export function _renderRunningTab() { }); // Route to the right server section body - const serverBodyId = `server-body-${(task.remoteHost || 'local').replace(/[^a-zA-Z0-9-]/g, '_')}`; + const serverBodyId = `server-body-${(_taskServerKey(task) || 'local').replace(/[^a-zA-Z0-9-]/g, '_')}`; const targetBody = document.getElementById(serverBodyId); if (targetBody) targetBody.appendChild(el); else group.appendChild(el); @@ -3393,7 +3555,8 @@ function _refreshServerDots() { let tasks; try { tasks = _loadTasks(); } catch { return; } const byKey = {}; - for (const t of tasks) { (byKey[t.remoteHost || ''] = byKey[t.remoteHost || ''] || []).push(t); } + const _taskServerKeyForDot = (task) => task?.remoteServerKey || task?.remoteHost || ''; + for (const t of tasks) { (byKey[_taskServerKeyForDot(t)] = byKey[_taskServerKeyForDot(t)] || []).push(t); } document.querySelectorAll('.cookbook-section-header').forEach(header => { const dot = header.querySelector('.cookbook-srv-status'); if (!dot) return; @@ -3527,7 +3690,9 @@ async function _probeEndpointUntilOnline(epId, host, port) { try { // Hit the probe endpoint — it re-probes server-side and updates // cached_models. We consume (and discard) the SSE stream. 
- await fetch(`/api/model-endpoints/${epId}/probe`, { credentials: 'same-origin' }).then(r => r.text()).catch(() => {}); + const probeRes = await fetch(`/api/model-endpoints/${epId}/probe`, { credentials: 'same-origin' }).catch(() => null); + if (probeRes && probeRes.status === 404) return; + if (probeRes) await probeRes.text().catch(() => {}); const eps = await fetch('/api/model-endpoints', { credentials: 'same-origin' }).then(r => r.json()).catch(() => []); const ep = (eps || []).find(e => e.id === epId); if (ep && (ep.models || []).length) { @@ -3565,7 +3730,7 @@ async function _pollBackgroundStatus() { } } if (added > 0) { - localStorage.setItem(TASKS_KEY, JSON.stringify(merged.map(_stripTaskSecrets))); + localStorage.setItem(TASKS_KEY, JSON.stringify(merged.map(_redactTaskForStorage))); _renderRunningTab(); } } @@ -3798,6 +3963,7 @@ export function initRunning(shared) { _persistEnvState = shared._persistEnvState; _refreshDependencies = shared._refreshDependencies; _serverByVal = shared._serverByVal; + _serverKey = shared._serverKey; _selectedServer = shared._selectedServer; modelLogo = shared.modelLogo; esc = shared.esc; diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index 33d56ef3c..06a990b82 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -10,6 +10,7 @@ import { providerLogo } from './providers.js'; import { modelColor } from './chatRenderer.js'; import { bindMenuDismiss, dismissOrRemove } from './escMenuStack.js'; import { openCookbookDependencies } from './cookbook-diagnosis.js'; +import { _hwfitCache } from './cookbook-hwfit.js'; // Shared state/functions injected by init() let _envState; @@ -17,6 +18,7 @@ let _sshCmd; let _getPort; let _sshPrefix; let _serverByVal; +let _serverKey; let _getPlatform; let _isWindows; let _isMetal; @@ -40,9 +42,62 @@ let _nextAvailablePort; // Storage keys const SERVE_STATE_KEY = 'cookbook-serve-state'; +const SERVE_FAVORITES_KEY = 'cookbook-serve-favorite-models'; let 
_cachedAllModels = []; +function _loadServeFavorites() { + try { + const raw = JSON.parse(localStorage.getItem(SERVE_FAVORITES_KEY) || '[]'); + return new Set(Array.isArray(raw) ? raw.filter(Boolean).map(String) : []); + } catch { + return new Set(); + } +} + +function _saveServeFavorites(favorites) { + try { + localStorage.setItem(SERVE_FAVORITES_KEY, JSON.stringify(Array.from(favorites || []))); + } catch {} +} + +function _redactStoredCommand(value) { + return String(value || '') + .replace(/hf_[A-Za-z0-9]{20,}/g, '[redacted-token]') + .replace(/((?:api[_-]?key|token|authorization|password|passwd|secret)\s*[=:]\s*)(["']?)[^\s"']+/gi, '$1$2[redacted]'); +} + +function _redactServeStateForStorage(value) { + if (!value || typeof value !== 'object') return value; + if (Array.isArray(value)) return value.map(_redactServeStateForStorage); + const safe = { ...value }; + for (const key of Object.keys(safe)) { + if (/token|password|passwd|secret|api[_-]?key/i.test(key)) { + delete safe[key]; + } else if (typeof safe[key] === 'string' && /cmd|command|args|env/i.test(key)) { + safe[key] = _redactStoredCommand(safe[key]); + } else if (safe[key] && typeof safe[key] === 'object') { + safe[key] = _redactServeStateForStorage(safe[key]); + } + } + return safe; +} + +function _isServeFavorite(repo) { + return _loadServeFavorites().has(String(repo || '')); +} + +function _toggleServeFavorite(repo) { + const key = String(repo || ''); + if (!key) return false; + const favorites = _loadServeFavorites(); + const next = !favorites.has(key); + if (next) favorites.add(key); + else favorites.delete(key); + _saveServeFavorites(favorites); + return next; +} + function _repoLooksAwqLike(model, repo) { const q = String(model?.quant || '').toUpperCase(); const n = `${repo || ''} ${model?.repo_id || ''} ${model?.name || ''} ${model?.path || ''}`.toLowerCase(); @@ -52,7 +107,9 @@ function _repoLooksAwqLike(model, repo) { function _repoLooksGgufLike(model, repo) { const q = String(model?.quant || 
'').toUpperCase(); const n = `${repo || ''} ${model?.repo_id || ''} ${model?.name || ''} ${model?.path || ''}`.toLowerCase(); - return !!model?.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || n.includes('gguf'); + const hasGgufFile = Array.isArray(model?.gguf_files) + && model.gguf_files.some(f => f && typeof f.rel_path === 'string' && /\.gguf$/i.test(f.rel_path)); + return !!model?.is_gguf || hasGgufFile || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || n.includes('gguf'); } function _serveBackendWarning(model, repo, backend, fields = {}) { @@ -95,8 +152,356 @@ function _allGpuIds(count) { return Array.from({ length: Math.floor(n) }, (_, i) => String(i)).join(','); } +function _shellSplitForPreview(cmd) { + const s = String(cmd || ''); + const out = []; + let cur = ''; + let quote = ''; + let escNext = false; + for (const ch of s) { + if (escNext) { + cur += ch; + escNext = false; + continue; + } + if (ch === '\\') { + cur += ch; + escNext = true; + continue; + } + if (quote) { + cur += ch; + if (ch === quote) quote = ''; + continue; + } + if (ch === '"' || ch === "'") { + quote = ch; + cur += ch; + continue; + } + if (/\s/.test(ch)) { + if (cur) { + out.push(cur); + cur = ''; + } + continue; + } + cur += ch; + } + if (cur) out.push(cur); + return out; +} + +function _formatServeCmdPreview(cmd) { + const raw = String(cmd || ''); + if (raw.startsWith('MODEL_FILE=$({')) { + const marker = /&&\s+([A-Za-z_][A-Za-z0-9_]*=\S+\s+)*(?:[A-Za-z_][A-Za-z0-9_]*=\S+\s+)?(?:llama-server|python3?\s+-m\s+llama_cpp\.server)\b/; + const match = raw.match(marker); + if (match && match.index > 0) { + const prelude = raw.slice(0, match.index).replace(/\s+/g, ' ').trim(); + const rest = raw.slice(match.index).replace(/^\s*&&\s*/, ''); + return `${prelude}\n&&\n${_formatServeCmdPreview(rest)}`; + } + } + const tokens = _shellSplitForPreview(cmd); + if (tokens.length <= 4) return String(cmd || ''); + const lines = []; + let i = 0; + while (i < tokens.length && 
/^[A-Za-z_][A-Za-z0-9_]*=/.test(tokens[i])) { + lines.push(tokens[i]); + i++; + } + if (tokens[i]) { + const head = [tokens[i++]]; + if (tokens[i] && !tokens[i].startsWith('--') && !/^[A-Za-z_][A-Za-z0-9_]*=/.test(tokens[i])) head.push(tokens[i++]); + if (tokens[i] && !tokens[i].startsWith('--') && !/^[A-Za-z_][A-Za-z0-9_]*=/.test(tokens[i])) head.push(tokens[i++]); + lines.push(head.join(' ')); + } + while (i < tokens.length) { + const t = tokens[i++]; + if (t.startsWith('--')) { + const vals = []; + while (i < tokens.length && !tokens[i].startsWith('--') && !/^[A-Za-z_][A-Za-z0-9_]*=/.test(tokens[i])) { + vals.push(tokens[i++]); + } + lines.push([t, ...vals].join(' ')); + } else { + lines.push(t); + } + } + return lines.join('\n'); +} + +function _normalizeServeCmdForLaunch(cmd) { + return String(cmd || '') + .replace(/MODEL_FILE=\$\(\{\s+/g, 'MODEL_FILE=$({ ') + .replace(/\s+\}\s+\|\s+head\s+-1\)/g, ' } | head -1)') + .replace(/\s*;\s*/g, '; ') + .replace(/\s*\|\|\s*/g, ' __ODY_OR__ ') + .replace(/\s*\|\s*/g, ' | ') + .replace(/\s+__ODY_OR__\s+/g, ' || ') + .replace(/\s+/g, ' ') + .trim(); +} + +function _modelSizeGb(model, explicitGb = 0) { + const explicit = Number(explicitGb || 0); + if (Number.isFinite(explicit) && explicit > 0) return explicit; + const bytes = Number(model?.size_bytes || 0); + if (Number.isFinite(bytes) && bytes > 0) return bytes / (1024 ** 3); + const gb = Number( + model?.size_gb + || model?.required_gb + || model?.vram_needed + || model?.min_vram_gb + || model?.recommended_ram_gb + || model?.min_ram_gb + || 0 + ); + if (Number.isFinite(gb) && gb > 0) return gb; + if (_isMiniMaxM3Model(model)) return 240; + return 0; +} + +function _parseParamsB(text) { + const s = String(text || ''); + const m = s.match(/(\d+(?:\.\d+)?)\s*([bBmMtT])\b/); + if (!m) return 0; + const n = parseFloat(m[1]); + if (!Number.isFinite(n) || n <= 0) return 0; + const unit = m[2].toLowerCase(); + if (unit === 't') return n * 1000; + if (unit === 'b') return n; + if 
(unit === 'm') return n / 1000; + return 0; +} + +function _knownModelContextMax(model) { + if (_isMiniMaxM3Model(model)) return 1048576; + return 0; +} + +function _modelIdentityText(model) { + return [ + model?.repo_id, + model?.quant_repo, + model?.name, + model?.id, + model?.path, + model?.model_path, + model?.served_model_name, + model?.quant, + model?.format, + ].filter(Boolean).join(' ').toLowerCase(); +} + +function _isMiniMaxM3Model(model) { + const name = _modelIdentityText(model); + return ( + (/minimax/.test(name) && /\bm3\b/.test(name)) + || /minimax-m3/.test(name) + || /models--cyankiwi--minimax-m3-awq-int4/.test(name) + || /cyankiwi\/minimax-m3-awq-int4/.test(name) + ); +} + +function _isMiniMaxM2Model(model) { + const name = _modelIdentityText(model); + return /minimax/.test(name) && /\bm2(?:\.\d+)?\b/.test(name); +} + +function _modelContextMaxForServe(model, explicitMax) { + const explicit = Number(explicitMax || 0); + if (Number.isFinite(explicit) && explicit > 0) return explicit; + const known = _knownModelContextMax(model); + if (known > 0) return known; + for (const key of ['context_length', 'max_position_embeddings', 'n_ctx_train', 'model_max_length', 'max_seq_len']) { + const value = Number(model?.[key] || 0); + if (Number.isFinite(value) && value > 0) return value; + } + const catalogCtx = Number(model?.context || 0); + if (Number.isFinite(catalogCtx) && catalogCtx > 0) return catalogCtx; + return 131072; +} + +function _estimateVllmContextFit(model, fields, modelCtxMax, modelWeightsGb = 0, fitSystem = null) { + const sys = fitSystem || _hwfitCache?.system || {}; + const isMiniMaxM3 = _isMiniMaxM3Model(model); + const gpuIds = String(fields.gpus || '').split(',').map(s => parseInt(s.trim(), 10)).filter(Number.isFinite); + const tp = Math.max(1, parseInt(fields.tp, 10) || gpuIds.length || 1); + const selectedCount = Math.max(1, gpuIds.length || tp); + const groups = Array.isArray(sys.gpu_groups) ? 
sys.gpu_groups : []; + const activeGroup = sys.active_group || groups[0] || null; + const perGpuGb = Number(activeGroup?.vram_each) + || (Number(sys.gpu_vram_gb) / Math.max(1, Number(sys.gpu_count) || selectedCount)) + || 0; + if (!perGpuGb) { + return { needsHardwareScan: true, reason: 'scan hardware first to estimate context from VRAM' }; + } + + const gpuUtil = Math.min(0.99, Math.max(0.1, parseFloat(fields.gpu_mem) || 0.90)); + const budgetGb = perGpuGb * selectedCount * gpuUtil; + const modelGb = _modelSizeGb(model, modelWeightsGb); + if (!modelGb) return { needsModelSize: true, reason: 'model weight size unknown; scan model files or enter context manually' }; + const modelMax = Math.max(1024, _modelContextMaxForServe(model, modelCtxMax)); + + if (isMiniMaxM3) { + const perGpuBudgetGb = perGpuGb * gpuUtil; + const modelShardGb = modelGb / Math.max(1, tp); + const fixedOverheadGb = Math.max(1.5, perGpuBudgetGb * 0.035); + const freeForKv = perGpuBudgetGb - modelShardGb - fixedOverheadGb; + const kvGbPerToken = (29.25 / 1048576) * (String(fields.vllm_kv_cache_dtype || '').toLowerCase() === 'fp8' ? 
1 : 1.8); + if (freeForKv <= 0) { + return { + ctx: 1024, + budgetGb, + modelGb, + kvGbPerToken, + reason: `model shard ${modelShardGb.toFixed(1)}G exceeds per-GPU usable ${perGpuBudgetGb.toFixed(1)}G before KV`, + }; + } + const raw = Math.floor((freeForKv / kvGbPerToken) * 0.99); + const rounded = Math.max(1024, Math.floor(raw / 128) * 128); + const ctx = Math.min(modelMax, rounded); + return { + ctx, + budgetGb, + modelGb, + kvGbPerToken, + reason: `~${ctx.toLocaleString()} tokens fits per-GPU KV (${freeForKv.toFixed(1)}G free)`, + }; + } + + const name = `${model?.repo_id || ''} ${model?.name || ''} ${model?.quant || ''}`; + const lower = name.toLowerCase(); + const isMoE = /\bmoe\b|a\d+b|minimax|deepseek|mixtral|kimi-k2|glm-4\.5/.test(lower); + const totalParams = _parseParamsB(name) || Math.max(1, modelGb / 0.58); + const activeFromName = (() => { + const m = lower.match(/\ba(\d+(?:\.\d+)?)b\b/); + return m ? parseFloat(m[1]) : 0; + })(); + const activeParams = activeFromName || (isMoE ? Math.min(totalParams, 32) : totalParams); + const effectiveActiveParams = (/minimax/.test(lower) && /\bm3\b/.test(lower)) ? 23 : activeParams; + const kvDtype = String(fields.vllm_kv_cache_dtype || '').toLowerCase(); + const kvFactor = kvDtype === 'fp8' ? 
0.55 : 1; + const kvGbPerTokenTotal = Math.max(0.00002, 0.000008 * effectiveActiveParams * kvFactor); + const kvGbPerToken = kvGbPerTokenTotal / Math.max(1, tp); + const perGpuBudgetGb = perGpuGb * gpuUtil; + const modelShardGb = modelGb / Math.max(1, tp); + const fixedOverheadGb = Math.max(1.5, perGpuBudgetGb * 0.035); + const freeForKv = perGpuBudgetGb - modelShardGb - fixedOverheadGb; + if (freeForKv <= 0) { + return { + ctx: 1024, + budgetGb, + modelGb, + kvGbPerToken, + reason: `model shard ${modelShardGb.toFixed(1)}G exceeds per-GPU usable ${perGpuBudgetGb.toFixed(1)}G before KV`, + }; + } + const raw = Math.floor(freeForKv / kvGbPerToken); + const rounded = Math.max(1024, Math.floor(raw / 1024) * 1024); + const ctx = Math.min(modelMax, rounded); + return { + ctx, + budgetGb, + modelGb, + kvGbPerToken, + reason: `~${ctx.toLocaleString()} tokens fits per-GPU KV (${freeForKv.toFixed(1)}G free)`, + }; +} + +function _estimateLlamaContextFit(model, fields, modelCtxMax, modelWeightsGb = 0, fitSystem = null, profileData = null) { + const profiles = Array.isArray(profileData?.profiles) ? 
profileData.profiles : []; + const preferred = profiles.find(p => String(p?.key || '').toLowerCase() === 'balanced') + || profiles.find(p => Number(p?.ctx) > 0) + || null; + const modelMax = Math.max(1024, _modelContextMaxForServe(model, modelCtxMax)); + if (preferred && Number(preferred.ctx) > 0) { + const ctx = Math.min(modelMax, Number(preferred.ctx)); + return { + ctx, + reason: `profile ${preferred.label || preferred.key || 'fit'} fits scanned hardware`, + }; + } + + const sys = fitSystem || _hwfitCache?.system || {}; + const modelGb = _modelSizeGb(model, modelWeightsGb); + const backend = String(fields.backend || '').toLowerCase(); + const llamaMode = String(fields.llama_mode || '').toLowerCase(); + const isCpuMode = backend === 'llamacpp' && llamaMode === 'cpu'; + const isUnifiedMode = backend === 'llamacpp' && (llamaMode === 'unified' || fields.unified_mem); + if (!modelGb) { + return { + ctx: Math.min(modelMax, 32768), + needsModelSize: true, + reason: 'model weight size unknown; using model limit fallback', + }; + } + + if (isCpuMode) { + return { + ctx: Math.min(modelMax, 131072), + modelGb, + reason: 'CPU mode uses system RAM; capped to trained limit', + }; + } + + const gpuIds = String(fields.gpus || '').split(',').map(s => parseInt(s.trim(), 10)).filter(Number.isFinite); + const selectedCount = Math.max(1, gpuIds.length || parseInt(fields.tp, 10) || 1); + const groups = Array.isArray(sys.gpu_groups) ? sys.gpu_groups : []; + const activeGroup = sys.active_group || groups[0] || null; + const totalVramGb = Number(activeGroup?.vram_each) + ? Number(activeGroup.vram_each) * selectedCount + : (Number(sys.gpu_vram_gb) || 0); + if (!totalVramGb) { + return { + ctx: Math.min(modelMax, 32768), + modelGb, + needsHardwareScan: true, + reason: 'scan hardware first; using model limit fallback', + }; + } + + const totalRamGb = Number(sys.total_ram_gb) || 0; + const availableRamGb = Number(sys.available_ram_gb) || 0; + const unifiedPoolGb = isUnifiedMode + ? 
Math.max( + totalVramGb, + availableRamGb, + totalRamGb > 0 ? totalRamGb * 0.85 : 0 + ) + : totalVramGb; + const usableGb = isUnifiedMode + ? Math.max(1, unifiedPoolGb - Math.max(2.0, unifiedPoolGb * 0.08)) + : Math.max(1, totalVramGb - Math.max(1.0, selectedCount * 0.6)); + const freeForKv = usableGb - modelGb; + const kv = String(fields.cache_type || '').toLowerCase(); + const kvFactor = kv === 'q4_0' ? 0.55 : (kv === 'q8_0' ? 1 : (kv === 'f16' ? 1.9 : 1)); + const kvGbPerToken = Math.max(0.00008, (modelGb / 7.5) * 0.0007 * kvFactor); + if (freeForKv <= 0) { + return { + ctx: Math.min(modelMax, 8192), + modelGb, + kvGbPerToken, + reason: `model ${modelGb.toFixed(1)}G exceeds usable ${isUnifiedMode ? 'unified memory' : 'VRAM'} ${usableGb.toFixed(1)}G before KV`, + }; + } + const raw = Math.floor(freeForKv / kvGbPerToken); + const rounded = Math.max(1024, Math.floor(raw / 1024) * 1024); + const ctx = Math.min(modelMax, rounded); + return { + ctx, + modelGb, + kvGbPerToken, + reason: `~${ctx.toLocaleString()} tokens fits llama.cpp KV (${freeForKv.toFixed(1)}G free ${isUnifiedMode ? 'unified' : 'VRAM'})`, + }; +} + function _selectedServeTarget(panel) { - const select = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server'); + const select = panel?.querySelector?.('#hwfit-server-select') + || document.getElementById('hwfit-server-select') + || document.getElementById('hwfit-dl-server'); const servers = Array.isArray(_envState.servers) ? _envState.servers : []; let host = _envState.remoteHost || ''; let server = host ? (_serverByVal?.(_envState.remoteServerKey || host) || servers.find(s => s.host === host)) : null; @@ -116,8 +521,10 @@ function _selectedServeTarget(panel) { : (server?.name || 'local server'); return { host, - port: host ? (server?.port || _getPort(host) || '') : '', + serverKey: server ? (_serverKey?.(server) || '') : (select?.value || ''), + serverName: server?.name || '', env: server?.env || '', + port: host ? 
(server?.port || _getPort(host) || '') : '', venv, platform: server?.platform || _envState.platform || '', label, @@ -166,6 +573,9 @@ function _runtimeNoteText(backend, pkg, target) { const label = labels[backend] || backend; if (!pkg) return `${label} readiness unavailable for ${target.label}.`; const note = pkg.status_note || pkg.update_note || ''; + if (pkg.installed === null || pkg.probe_error) { + return note ? `${label} readiness unavailable for ${target.label}: ${note}` : `${label} readiness unavailable for ${target.label}.`; + } if (pkg.installed) { return note ? `${label} ready on ${target.label}: ${note}` : `${label} ready on ${target.label}.`; } @@ -240,6 +650,13 @@ function _runnableGgufFiles(model) { return primary.length ? primary : files; } +function _selectedGgufSizeGb(model, relPath) { + const file = _runnableGgufFiles(model).find(f => f.rel_path === relPath); + const bytes = Number(file?.size_bytes || 0); + if (!Number.isFinite(bytes) || bytes <= 0) return 0; + return bytes / (1024 ** 3); +} + function _ggufFileLabel(file) { const base = (file.name || file.rel_path || '').split('/').pop(); const size = _formatGgufSize(file.size_bytes); @@ -250,6 +667,122 @@ function _ggufFileLabel(file) { return `${quant}${base}${size || split ? ` (${[size, split.replace(/^, /, '')].filter(Boolean).join(', ')})` : ''}${role}`; } +function _ggufTaskDisplayPart(model, relPath) { + const rel = String(relPath || ''); + if (!rel) return ''; + const file = _ggufFilesForModel(model).find(f => f.rel_path === rel); + if (file?.quant) return String(file.quant).toUpperCase().replace(/^UD-/, ''); + const parts = rel.split('/').filter(Boolean); + const base = parts[parts.length - 1] || ''; + const parent = parts.length > 1 ? 
parts[parts.length - 2] : ''; + const text = `${parent} ${base}`; + const quant = text.match(/\b(?:UD-)?(?:IQ[1-8]_[A-Z0-9]+|Q[2-8]_K_[MLS]|Q[2-8]_[0-9A-Z]+|Q[2-8])\b/i); + if (quant) return quant[0].toUpperCase().replace(/^UD-/, ''); + return base.replace(/\.gguf$/i, '').replace(/-\d{5}-of-\d{5}$/i, ''); +} + +function _serveTaskDisplayName(shortName, model, fields) { + const name = String(shortName || '').trim(); + const backend = String(fields?.backend || '').toLowerCase(); + if (backend !== 'llamacpp' && backend !== 'ollama') return name; + const part = _ggufTaskDisplayPart(model, fields?.gguf_file); + return part && !name.includes(` · ${part}`) ? `${name} · ${part}` : name; +} + +function _safeGgufRelPath(relPath) { + const rel = String(relPath || '').replace(/\\/g, '/').replace(/^\/+/, ''); + if (!rel || rel.startsWith('../') || rel.includes('/../') || rel === '..') return ''; + if (rel.includes('\0')) return ''; + return rel; +} + +function _ggufDeleteChoice(repo, files) { + return new Promise(resolve => { + let overlay = document.getElementById('cookbook-gguf-delete-overlay'); + if (!overlay) { + overlay = document.createElement('div'); + overlay.id = 'cookbook-gguf-delete-overlay'; + overlay.className = 'modal hidden'; + overlay.innerHTML = + ''; + document.body.appendChild(overlay); + } + + const safeFiles = files + .map(f => ({ ...f, rel_path: _safeGgufRelPath(f.rel_path) })) + .filter(f => f.rel_path); + const msg = overlay.querySelector('#cookbook-gguf-delete-msg'); + const list = overlay.querySelector('#cookbook-gguf-delete-list'); + const cancelBtn = overlay.querySelector('#cookbook-gguf-delete-cancel'); + const repoBtn = overlay.querySelector('#cookbook-gguf-delete-repo'); + const selectedBtn = overlay.querySelector('#cookbook-gguf-delete-selected'); + const prevFocus = document.activeElement; + + msg.textContent = `${repo} has multiple GGUF files. Pick what to delete.`; + list.innerHTML = safeFiles.map((file, idx) => { + const label = esc ? 
esc(_ggufFileLabel(file)) : _ggufFileLabel(file); + const rel = esc ? esc(file.rel_path) : file.rel_path; + return ``; + }).join(''); + + function cleanup(result) { + overlay.classList.add('hidden'); + overlay.style.display = 'none'; + cancelBtn.removeEventListener('click', onCancel); + repoBtn.removeEventListener('click', onRepo); + selectedBtn.removeEventListener('click', onSelected); + overlay.removeEventListener('click', onBackdrop); + document.removeEventListener('keydown', onKey); + try { prevFocus && prevFocus.focus && prevFocus.focus(); } catch {} + resolve(result); + } + function onCancel() { cleanup(null); } + function onRepo() { cleanup({ mode: 'repo' }); } + function onSelected() { + const selected = [...list.querySelectorAll('input[type="checkbox"]:checked')] + .map(input => safeFiles[Number(input.value)]) + .filter(Boolean); + if (!selected.length) { + uiModule.showToast?.('Select at least one GGUF file.'); + return; + } + cleanup({ mode: 'files', files: selected }); + } + function onBackdrop(e) { if (e.target === overlay) cleanup(null); } + function onKey(e) { + if (e.key === 'Escape') { + e.preventDefault(); + e.stopPropagation(); + cleanup(null); + } + } + + cancelBtn.addEventListener('click', onCancel); + repoBtn.addEventListener('click', onRepo); + selectedBtn.addEventListener('click', onSelected); + overlay.addEventListener('click', onBackdrop); + document.addEventListener('keydown', onKey); + overlay.classList.remove('hidden'); + overlay.style.display = ''; + selectedBtn.focus(); + }); +} + function _shellPathExpr(path) { const s = String(path || ''); if (s === '~') return '${HOME}'; @@ -295,6 +828,12 @@ function _rerenderCachedModels() { else if (sortVal === 'size-desc') allModels.sort((a, b) => _parseSize(b.size) - _parseSize(a.size)); else if (sortVal === 'size-asc') allModels.sort((a, b) => _parseSize(a.size) - _parseSize(b.size)); else if (sortVal === 'recent') allModels.sort((a, b) => (b.mtime || 0) - (a.mtime || 0)); + const favorites = 
_loadServeFavorites(); + allModels.sort((a, b) => { + const af = favorites.has(String(a.repo_id || '')) ? 1 : 0; + const bf = favorites.has(String(b.repo_id || '')) ? 1 : 0; + return bf - af; + }); let html = ''; let visibleCount = 0; @@ -317,8 +856,9 @@ function _rerenderCachedModels() { // living on the same line as the model name. const _isDownloading = m.status === 'downloading'; const _isDlActive = _isDownloading ? _isActivelyDownloading(m.repo_id) : false; + const _isFavorite = favorites.has(String(m.repo_id || '')); const isSelectMode = document.getElementById('hwfit-cache-select')?.classList.contains('active'); - html += `
`; + html += `
`; html += ``; html += `
`; const _mc = modelColor(m.repo_id) || ''; @@ -328,7 +868,8 @@ function _rerenderCachedModels() { const _downloadingPill = _isDownloading ? ` ${_isDlActive ? 'downloading' : 'stalled'}` : ''; - html += `
${modelLogo(m.repo_id)}${esc(shortName)}${hfLink ? ` HF ↗` : ''}${_runningPill}${_downloadingPill}
`; + const _favoritePill = _isFavorite ? ' pinned' : ''; + html += `
${modelLogo(m.repo_id)}${esc(shortName)}${_favoritePill}${hfLink ? ` HF ↗` : ''}${_runningPill}${_downloadingPill}
`; html += `
${metaParts.join(' \u00b7 ')}
`; html += `
`; const _bk = _detectBackend(m).backend; @@ -411,7 +952,12 @@ function _rerenderCachedModels() { const _deleteIco = ''; const _selectIco = ''; const _schedIco = ''; + const _favNow = _isServeFavorite(repo); + const _favIco = _favNow + ? '' + : ''; const items = []; + items.push({ label: _favNow ? 'Unfavorite' : 'Favorite', icon: _favIco, action: 'favorite' }); if (m && m.status === 'ready') items.push({ label: 'Serve', icon: _serveIco, action: 'serve' }); if (m && m.status === 'downloading') items.push({ label: 'Retry', icon: _retryIco, action: 'retry' }); if (m && m.status === 'ready') items.push({ label: 'Schedule…', icon: _schedIco, action: 'schedule' }); @@ -424,6 +970,11 @@ function _rerenderCachedModels() { div.addEventListener('click', () => { closeDropdown(); if (opt.action === 'serve') item.click(); + else if (opt.action === 'favorite') { + const favored = _toggleServeFavorite(repo); + uiModule.showToast(favored ? 'Favorited — pinned to top' : 'Unfavorited'); + _rerenderCachedModels(); + } else if (opt.action === 'delete') _deleteCachedModel(repo, item, false, m); else if (opt.action === 'retry') _retryCachedModel(repo, m); else if (opt.action === 'schedule') { @@ -510,6 +1061,7 @@ function _rerenderCachedModels() { item.classList.remove('doclib-card-expanded'); item.style.flexDirection = ''; item.style.alignItems = ''; + item.style.maxHeight = ''; list.style.minHeight = ''; list.style.maxHeight = ''; return; @@ -523,6 +1075,7 @@ function _rerenderCachedModels() { c.classList.remove('doclib-card-expanded'); c.style.flexDirection = ''; c.style.alignItems = ''; + c.style.maxHeight = ''; }); const shortName = repo.split('/').pop(); @@ -544,17 +1097,28 @@ function _rerenderCachedModels() { const ss = (_byRepo[repo] && typeof _byRepo[repo] === 'object') ? _byRepo[repo] : (_lastUsed || (_isLegacyFlat ? _allSs : {})); + const _modelSs = (_byRepo[repo] && typeof _byRepo[repo] === 'object') ? 
_byRepo[repo] : null; + const _repoForcedBackend = !!(_modelSs && _modelSs._forceBackend); + const _isMiniMaxM3 = _isMiniMaxM3Model({ ...m, repo_id: repo }); + const _isMiniMaxM2 = _isMiniMaxM2Model({ ...m, repo_id: repo }); + const _isMiniMaxMSeries = _isMiniMaxM3 || _isMiniMaxM2; + const _toolParserDefault = _detectToolParser(repo); + const _isStepFunStep = _toolParserDefault === 'step3p5'; + const _nativeToolDefault = _isMiniMaxMSeries || _isStepFunStep; + const _reasoningDefault = _isMiniMaxMSeries || _isStepFunStep; + const _expertParallelDefault = _isMiniMaxMSeries || _isStepFunStep; + const svm = (k, def) => (_modelSs && _hasOwn(_modelSs, k)) ? _modelSs[k] : def; const _serveTarget = _selectedServeTarget(); const _backendChoices = _backendChoicesForTarget(_serveTarget); const _allowedBackends = new Set(_backendChoices.map(([v]) => v)); const detectedBackend = _detectBackend(m).backend; - let defaultBackend = (ss._forceBackend && ss.backend && _allowedBackends.has(ss.backend)) + let defaultBackend = (_repoForcedBackend && ss.backend && _allowedBackends.has(ss.backend)) ? ss.backend : detectedBackend; if (!_allowedBackends.has(defaultBackend)) defaultBackend = _backendChoices[0]?.[0] || detectedBackend; - const savedMatchesBackend = !!ss._forceBackend || (ss.backend || 'vllm') === detectedBackend; + const savedMatchesBackend = _repoForcedBackend || (ss.backend || 'vllm') === detectedBackend; const sv = (k, def) => (ss[k] !== undefined && savedMatchesBackend) ? ss[k] : def; - const defaultTp = defaultBackend === 'llamacpp' ? '1' : sv('tp', '1'); + const defaultTp = defaultBackend === 'llamacpp' ? '1' : sv('tp', _isMiniMaxMSeries ? '8' : '1'); const detectedGpuIds = _allGpuIds(_getGpuToggleTotal?.()); const defaultGpus = defaultBackend === 'llamacpp' ? '0' @@ -568,18 +1132,30 @@ function _rerenderCachedModels() { // OOMs. _detectModelOptimizations seeds opts.kvCacheDtype for // those families; honour it unless the user has a saved override. 
const _kvOptsCheck = _detectModelOptimizations(repo); - const _kvAutoDefault = (_kvOptsCheck && _kvOptsCheck.kvCacheDtype) || 'auto'; + const _kvAutoDefault = (_kvOptsCheck && _kvOptsCheck.kvCacheDtype) || (_isMiniMaxMSeries ? 'fp8' : 'auto'); const _kvSelected = sv('vllm_kv_cache_dtype', _kvAutoDefault); const vllmKvCacheOpts = ['auto','fp8'].map(d => ``).join(''); const _l = (name, tip) => `${name}?`; const _ggufChoices = _runnableGgufFiles(m); const _savedGguf = String(sv('gguf_file', '') || ''); + const _preferredGgufInclude = String(sv('_preferredGgufInclude', '') || '').replace(/\*/g, '').toLowerCase(); + const _preferredGguf = _preferredGgufInclude + ? (_ggufChoices.find(f => String(f.rel_path || '').toLowerCase().includes(_preferredGgufInclude)) + || _ggufChoices.find(f => String(f.name || '').toLowerCase().includes(_preferredGgufInclude))) + : null; const _defaultGguf = _ggufChoices.some(f => f.rel_path === _savedGguf) ? _savedGguf + : (_preferredGguf?.rel_path || '') + ? _preferredGguf.rel_path : (_ggufChoices[0]?.rel_path || ''); const _ggufOptions = _ggufChoices.map(f => `` ).join(''); + const _minimaxM3Snapshot = '/home/pewds/.cache/huggingface/hub/models--cyankiwi--MiniMax-M3-AWQ-INT4/snapshots/4082acbbec1236d21828d55b6bb0fe02ade4ab5b'; + const _defaultServeModel = _isMiniMaxM3 ? _minimaxM3Snapshot : (m.is_local_dir && m.path ? `${m.path}/${repo}` : repo); + const _savedModelPath = String(svm('model_path', _defaultServeModel) || '').trim(); + const _modelPathValue = _isMiniMaxM3 && (!_savedModelPath || _savedModelPath === repo) ? _minimaxM3Snapshot : _savedModelPath; + const _defaultServedModelName = _isMiniMaxM3 ? repo : ''; // Build save slots const _allPresets = _loadPresets(); const _repoShort = repo.split('/').pop(); @@ -596,17 +1172,16 @@ function _rerenderCachedModels() { const _arrowTitle = _modelPresets.length > 0 ? `${_modelPresets.length} saved launch config${_modelPresets.length === 1 ? 
'' : 's'} for ${_repoShort} — click ▾ to load or delete` : `No saved launch configs for ${_repoShort} yet — click Save to add one`; - // Wrap the Save split in a
`; - // Row 3a: Checkboxes (llama.cpp-only) + // Group 4 — llama.cpp toggles. Single row of checkboxes, GPU-only + // ones (Flash Attn, Allow CPU overflow) hide + // automatically in CPU mode. Order: perf-critical → safety → I/O → + // niche. MTP Spec sits last because it owns its own numstep widget + // and is the widest item. panelHtml += `
`; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; panelHtml += `
`; // Row 3b: Checkboxes (diffusers) panelHtml += `
`; @@ -832,24 +1467,20 @@ function _rerenderCachedModels() { // Command preview + actions. Wrap the textarea so a floating Copy // button can sit at its top-right corner — same pattern as the chat // run-output panel. + panelHtml += `
`; + panelHtml += `Launch command`; panelHtml += `
`; panelHtml += ``; panelHtml += ``; panelHtml += `
`; + panelHtml += `
`; panelHtml += `
`; - // Split button: main "Clear Server" + caret that opens Probe / Cancel. - // The .cookbook-gpu-probe button stays in the DOM but hidden so the - // existing event-listener wiring further down keeps working — the - // popup just programmatically clicks it. - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; - // Copy moved inside the command textarea (top-right). Spacer then - // pushes Cancel + Launch to the right. - panelHtml += ``; panelHtml += ``; + // Copy moved inside the command textarea (top-right). Spacer then + // pushes Clear Server + Launch to the right. + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; // Launch + a small ^ that opens an inline schedule form. The form // creates a ScheduledTask (action=cookbook_serve), so the schedule // ends up in the existing Tasks UI for edit/delete/pause. @@ -857,7 +1488,7 @@ function _rerenderCachedModels() { panelHtml += ``; // Chevron points DOWN because the schedule form opens beneath the // panel — the arrow signals the direction of motion, not menu state. - panelHtml += ``; + panelHtml += ``; panelHtml += ``; panelHtml += `
`; panelHtml += `
`; @@ -869,6 +1500,21 @@ function _rerenderCachedModels() { const panel = item.querySelector('.hwfit-serve-panel'); // Scroll the serve panel into view within its nearest scrollable ancestor requestAnimationFrame(() => panel.scrollIntoView({ block: 'nearest', behavior: 'smooth' })); + // Firefox-mobile fallback: the CSS that grows the cached-list and + // expanded card uses :has(.doclib-card-expanded), which Firefox + // mobile doesn't support — so the panel stays collapsed and the + // form is unusable. Pin explicit px heights here. On Chromium/ + // WebKit the !important CSS still wins, so this is a no-op there. + // (See project_skills_expand_firefox memory note.) + requestAnimationFrame(() => { + try { + const _itemH = Math.max(item.scrollHeight, item.getBoundingClientRect().height); + if (_itemH > 0) item.style.maxHeight = _itemH + 'px'; + const _listH = Math.max(list.scrollHeight, list.getBoundingClientRect().height); + if (_listH > 0) list.style.maxHeight = _listH + 'px'; + list.style.minHeight = _listH + 'px'; + } catch {} + }); // Build command preview function updateCmd() { @@ -878,7 +1524,7 @@ function _rerenderCachedModels() { else f[el.dataset.field] = el.value; }); const backend = f.backend || 'vllm'; - const serveModel = m.is_local_dir && m.path ? `${m.path}/${repo}` : repo; + const serveModel = (f.model_path || '').trim() || (m.is_local_dir && m.path ? `${m.path}/${repo}` : repo); if (backend === 'llamacpp') { const ggufChoices = _runnableGgufFiles(m); const selectedGguf = ggufChoices.find(file => file.rel_path === f.gguf_file); @@ -904,11 +1550,19 @@ function _rerenderCachedModels() { } if (f.reasoning_parser) { const _rpEl2 = panel.querySelector('[data-field="reasoning_parser"]'); - f._reasoning_parser_value = _rpEl2?.dataset?.parser || 'qwen3'; + f._reasoning_parser_value = _rpEl2?.dataset?.parser || ''; + } + if (f.vllm_env_preset === 'minimax_m3_cuda') { + const existingEnv = String(f.extra_env || '').trim(); + const envParts = existingEnv ? 
existingEnv.split(/\s+/) : []; + const hasEnv = (key) => envParts.some(p => p.startsWith(`${key}=`)); + if (!hasEnv('VLLM_TARGET_DEVICE')) envParts.unshift('VLLM_TARGET_DEVICE=cuda'); + if (!hasEnv('VLLM_USE_FLASHINFER_SAMPLER')) envParts.push('VLLM_USE_FLASHINFER_SAMPLER=0'); + f.extra_env = envParts.join(' '); } let cmd = _buildServeCmd(f, serveModel, backend); if (f.extra && f.extra.trim()) cmd += ' ' + f.extra.trim(); - const _ce2 = panel.querySelector('.hwfit-serve-cmd'); _ce2.value = cmd; _ce2.style.height = 'auto'; _ce2.style.height = _ce2.scrollHeight + 'px'; + const _ce2 = panel.querySelector('.hwfit-serve-cmd'); _ce2.value = _formatServeCmdPreview(cmd); _ce2.style.height = 'auto'; _ce2.style.height = _ce2.scrollHeight + 'px'; panel._cmd = cmd; panel._host = f.host || ''; return cmd; @@ -923,7 +1577,94 @@ function _rerenderCachedModels() { // - panel._modelCtxMax: the model's actual trained limit (set by the // profiles fetch below) — a tighter, model-specific cap when known. const ABSOLUTE_CTX_MAX = 1048576; // 1M tokens — above any real n_ctx_train + panel._modelCtxMax = panel._modelCtxMax || _knownModelContextMax(m) || 0; + panel._modelWeightsGb = panel._modelWeightsGb || 0; + panel._fitSystem = panel._fitSystem || null; const _ctxEl0 = panel.querySelector('[data-field="ctx"]'); + const _ctxAutoNote = panel.querySelector('.hwfit-auto-ctx-note'); + const _ctxCalcBtn = panel.querySelector('.hwfit-context-calc-btn'); + if (_ctxEl0) _ctxEl0.dataset.autoCtx = '0'; + panel._contextProfileData = panel._contextProfileData || null; + function _collectServeFields() { + const f = {}; + panel.querySelectorAll('.hwfit-sf').forEach(el => { + if (el.type === 'checkbox') f[el.dataset.field] = el.checked; + else f[el.dataset.field] = el.value; + }); + return f; + } + function _updateRecommendedCtx(apply = true) { + if (!_ctxEl0) return; + const f = _collectServeFields(); + const backend = f.backend || 'vllm'; + let fit = null; + if (backend === 'vllm' || backend === 
'sglang') { + fit = _estimateVllmContextFit(m, f, panel._modelCtxMax, panel._modelWeightsGb, panel._fitSystem); + } else if (backend === 'llamacpp' || backend === 'ollama') { + const ggufGb = _selectedGgufSizeGb(m, f.gguf_file); + fit = _estimateLlamaContextFit(m, f, panel._modelCtxMax, ggufGb || panel._modelWeightsGb, panel._fitSystem, panel._contextProfileData); + } else { + if (_ctxAutoNote) _ctxAutoNote.textContent = ''; + return; + } + if (!fit) { + if (_ctxAutoNote) _ctxAutoNote.textContent = ''; + return; + } + if ((fit.needsHardwareScan || fit.needsModelSize) && !fit.ctx) { + if (_ctxAutoNote) { + _ctxAutoNote.textContent = fit.reason; + _ctxAutoNote.title = fit.reason; + } + return; + } + if (_ctxAutoNote) { + _ctxAutoNote.textContent = `Auto ${fit.ctx.toLocaleString()} · ${fit.reason}`; + const _llamaMemoryLabel = String(f.llama_mode || '').toLowerCase() === 'unified' || f.unified_mem + ? 'unified system memory' + : 'selected GPU memory'; + _ctxAutoNote.title = backend === 'llamacpp' || backend === 'ollama' + ? 
`Estimated from scanned GGUF/model size, trained context limit, and ${_llamaMemoryLabel} for llama.cpp KV cache.` + : `Estimated from model size, selected GPU VRAM, GPU utilization, TP, and KV dtype.`; + } + if (apply && _ctxEl0.dataset.autoCtx === '1') { + const next = String(fit.ctx); + if (_ctxEl0.value !== next) { + _ctxEl0.value = next; + updateCmd(); + } + } + } + async function _loadContextProfile() { + const target = _selectedServeTarget(panel); + const host = (target.host || '').trim(); + const params = new URLSearchParams({ model: repo }); + const profileModelPath = panel.querySelector('[data-field="model_path"]')?.value?.trim(); + if (profileModelPath && profileModelPath !== repo) params.set('model_path', profileModelPath); + if (host) { + params.set('host', host); + const _sp = (_serverByVal?.(target.serverKey || host) || (_es.servers || []).find(s => s.host === host) || {}).port; + if (_sp) params.set('ssh_port', _sp); + } + const res = await fetch(`/api/hwfit/profiles?${params}`); + const data = await res.json(); + const ctxMax = Number(data && data.model_ctx_max) || 0; + const weightsGb = Number(data && data.model_weights_gb) || 0; + if (data && data.system && typeof data.system === 'object' && !data.system.error) { + panel._fitSystem = data.system; + } + panel._contextProfileData = data || null; + if (weightsGb > 0) panel._modelWeightsGb = weightsGb; + if (ctxMax > 0) { + panel._modelCtxMax = Math.max(ctxMax, _knownModelContextMax(m) || 0); + _clampCtx(false); + } + if (_ctxAutoNote && data?.model_probe_error && (!ctxMax || !weightsGb)) { + _ctxAutoNote.textContent = data.model_probe_error; + _ctxAutoNote.title = data.model_probe_error; + } + return { ctxMax, weightsGb, data }; + } function _clampCtx(announce) { if (!_ctxEl0) return; const cap = panel._modelCtxMax > 0 ? 
panel._modelCtxMax : ABSOLUTE_CTX_MAX; @@ -936,9 +1677,58 @@ function _rerenderCachedModels() { } } if (_ctxEl0) { - _ctxEl0.addEventListener('change', () => _clampCtx(false)); + _ctxEl0.addEventListener('input', () => { _ctxEl0.dataset.autoCtx = '0'; }); + _ctxEl0.addEventListener('change', () => { _ctxEl0.dataset.autoCtx = '0'; _clampCtx(false); _updateRecommendedCtx(false); }); _ctxEl0.addEventListener('blur', () => _clampCtx(false)); + if (_ctxCalcBtn) { + let _ctxAutoTouchHandled = false; + const _runContextAuto = async () => { + if (_ctxCalcBtn.disabled) return; + const oldHtml = _ctxCalcBtn.innerHTML; + let calcWp = null; + _ctxCalcBtn.disabled = true; + _ctxCalcBtn.textContent = ''; + try { + calcWp = spinnerModule.createWhirlpool(12); + calcWp.element.classList.add('hwfit-context-calc-spinner'); + _ctxCalcBtn.appendChild(calcWp.element); + } catch (_) { + _ctxCalcBtn.textContent = '...'; + } + try { + await _loadContextProfile(); + } catch (err) { + if (_ctxAutoNote) { + _ctxAutoNote.textContent = 'context scan failed'; + _ctxAutoNote.title = err?.message || 'context scan failed'; + } + } finally { + if (calcWp) calcWp.destroy(); + _ctxCalcBtn.disabled = false; + _ctxCalcBtn.innerHTML = oldHtml; + } + _ctxEl0.dataset.autoCtx = '1'; + _updateRecommendedCtx(true); + _ctxEl0.dataset.autoCtx = '0'; + _clampCtx(false); + }; + _ctxCalcBtn.addEventListener('pointerup', (ev) => { + if (ev.pointerType !== 'touch') return; + ev.preventDefault(); + ev.stopPropagation(); + _ctxAutoTouchHandled = true; + _runContextAuto(); + setTimeout(() => { _ctxAutoTouchHandled = false; }, 350); + }); + _ctxCalcBtn.addEventListener('click', async (ev) => { + ev.preventDefault(); + ev.stopPropagation(); + if (_ctxAutoTouchHandled) return; + await _runContextAuto(); + }); + } _clampCtx(false); // fix any stale/preset value already present + _updateRecommendedCtx(false); } // Tighten the ctx slider's upper bound to the model's trained limit. 
@@ -948,20 +1738,8 @@ function _rerenderCachedModels() { // the rest of the serve panel was off — but this clamp is essential. (async () => { try { - const host = (_es.remoteHost || '').trim(); - const params = new URLSearchParams({ model: repo }); - if (host) { - params.set('host', host); - const _sp = (_es.servers || []).find(s => s.host === host)?.port; - if (_sp) params.set('ssh_port', _sp); - } - const res = await fetch(`/api/hwfit/profiles?${params}`); - const data = await res.json(); - const ctxMax = Number(data && data.model_ctx_max) || 0; - if (ctxMax > 0) { - panel._modelCtxMax = ctxMax; - _clampCtx(false); - } + const { ctxMax, weightsGb } = await _loadContextProfile(); + if (ctxMax > 0 || weightsGb > 0) _updateRecommendedCtx(false); } catch { /* clamp falls back to the static default */ } })(); @@ -1024,9 +1802,9 @@ function _rerenderCachedModels() { // custom picker so the dropdown lists "[V] vLLM", "[⚡] SGLang", etc. const _BACKEND_GLYPHS = { vllm: '', - sglang: '', - llamacpp: '', - ollama: '', + sglang: '', + llamacpp: '', + ollama: '', diffusers: '', }; @@ -1165,7 +1943,11 @@ function _rerenderCachedModels() { const { pkg, target } = await _fetchServeRuntimePackage(panel, backend); if (panel._runtimeReadinessSeq !== seq) return; _writeNote(_runtimeNoteText(backend, pkg, target)); - if (!pkg?.installed) { + if (pkg?.installed === null || pkg?.probe_error) { + note.style.color = 'var(--fg-muted)'; + note.style.borderColor = 'color-mix(in srgb, var(--fg) 16%, transparent)'; + note.style.background = 'color-mix(in srgb, var(--fg) 4%, transparent)'; + } else if (!pkg?.installed) { note.style.color = 'var(--red)'; note.style.borderColor = 'color-mix(in srgb, var(--red) 40%, transparent)'; note.style.background = 'color-mix(in srgb, var(--red) 8%, transparent)'; @@ -1288,6 +2070,31 @@ function _rerenderCachedModels() { }); const _gf = panel.querySelector('[data-field="gpus"]'); if (_gf) _gf.value = activeGpus.join(','); + { + const modeHidden = 
panel.querySelector('[data-field="llama_mode"]'); + const unifiedHidden = panel.querySelector('[data-field="unified_mem"]'); + const loadedUnified = ['1', 'true', 'yes', 'on'].includes(String(unifiedHidden?.value || '').toLowerCase()); + const loadedMode = loadedUnified && modeHidden?.value !== 'cpu' + ? 'unified' + : (modeHidden?.value || 'gpu'); + if (modeHidden) modeHidden.value = loadedMode; + if (unifiedHidden) unifiedHidden.value = loadedMode === 'unified' ? '1' : ''; + const modeGroup = panel.querySelector('[data-llama-mode-toggle]'); + if (modeGroup) { + modeGroup.querySelectorAll('.mode-toggle-btn').forEach(btn => { + const isActive = btn.dataset.llamaMode === loadedMode; + btn.classList.toggle('active', isActive); + btn.setAttribute('aria-pressed', isActive ? 'true' : 'false'); + }); + modeGroup.classList.toggle('mode-right', false); + modeGroup.classList.toggle('mode-mid', loadedMode === 'gpu'); + modeGroup.classList.toggle('mode-third', loadedMode === 'unified'); + } + panel.classList.toggle('cookbook-llama-cpu-mode', loadedMode === 'cpu'); + panel.querySelectorAll('.cookbook-llama-gpu-only').forEach(el => { + el.style.display = loadedMode === 'cpu' ? 'none' : ''; + }); + } updateBackendVisibility(); updateRuntimeReadinessNote(); updateCmd(); @@ -1333,7 +2140,7 @@ function _rerenderCachedModels() { if (el.type === 'checkbox') fields[el.dataset.field] = el.checked; else fields[el.dataset.field] = el.value; }); - presets.push({ name: shortName, model: repo, cmd, remoteHost: host, port: fields.port || '8000', label, fields }); + presets.push(_redactServeStateForStorage({ name: shortName, model: repo, cmd, remoteHost: host, port: fields.port || '8000', label, fields })); _savePresets(presets); uiModule.showToast(`Saved "${label}"`); _updateSavedToggleLabel(); @@ -1345,7 +2152,12 @@ function _rerenderCachedModels() { // fixed at the toggle and right-aligned to it. 
function _showSavedConfigMenu(anchor) { document.querySelectorAll('.cookbook-saved-menu').forEach(d => { if (typeof d._dismiss === 'function') d._dismiss(); else d.remove(); }); - const modelSlots = _presetsForModel(_loadPresets(), repo); + const modelSlots = _presetsForModel(_loadPresets(), repo) + .map((preset, slotIdx) => ({ preset, slotIdx })) + .sort((a, b) => { + const favDelta = (b.preset.favorite ? 1 : 0) - (a.preset.favorite ? 1 : 0); + return favDelta || (a.slotIdx - b.slotIdx); + }); const dropdown = document.createElement('div'); dropdown.className = 'dropdown cookbook-saved-menu'; let closeMenu = () => { dropdown.remove(); anchor.classList.remove('cookbook-menu-active'); }; @@ -1362,13 +2174,18 @@ function _rerenderCachedModels() { empty.textContent = 'No saved configs yet'; dropdown.appendChild(empty); } - modelSlots.forEach((p, idx) => { + modelSlots.forEach(({ preset: p, slotIdx }, idx) => { const it = document.createElement('div'); - it.className = 'dropdown-item-compact'; + it.className = 'dropdown-item-compact' + (p.favorite ? ' cookbook-saved-favorite' : ''); it.style.cssText = 'display:flex;align-items:center;justify-content:space-between;gap:8px;'; const lbl = document.createElement('span'); lbl.textContent = p.label || `Config ${idx + 1}`; lbl.style.cssText = 'flex:1;min-width:0;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;'; + const fav = document.createElement('button'); + fav.type = 'button'; + fav.className = 'cookbook-saved-fav-btn' + (p.favorite ? ' active' : ''); + fav.title = p.favorite ? 
'Unfavorite' : 'Favorite'; + fav.innerHTML = ''; const del = document.createElement('button'); del.type = 'button'; del.innerHTML = '×'; @@ -1377,6 +2194,12 @@ function _rerenderCachedModels() { del.addEventListener('mouseenter', () => { del.style.color = '#f44'; }); del.addEventListener('mouseleave', () => { del.style.color = 'var(--fg-muted)'; }); it.appendChild(lbl); + if (p.favorite) { + const badge = document.createElement('span'); + badge.className = 'memory-cat-badge memory-cat-pinned cookbook-saved-fav-badge'; + badge.textContent = 'pinned'; + it.appendChild(badge); + } if (p.confirmedWorking) { const badge = document.createElement('span'); badge.className = 'cookbook-saved-confirmed'; @@ -1384,13 +2207,14 @@ function _rerenderCachedModels() { badge.innerHTML = ''; it.appendChild(badge); } + it.appendChild(fav); it.appendChild(del); it.addEventListener('click', (e) => { - if (e.target === del) return; + if (e.target === del || e.target === fav || fav.contains(e.target)) return; e.stopPropagation(); // Close the menu FIRST so it always dismisses, even if loading throws. closeMenu(); - _loadSlotIntoPanel(idx); + _loadSlotIntoPanel(slotIdx); // Confirm the click landed — loading is silent otherwise, so it was // unclear the settings actually changed. uiModule.showToast(`Loaded "${p.label || `Config ${idx + 1}`}"`); @@ -1401,16 +2225,27 @@ function _rerenderCachedModels() { setTimeout(() => _cmdBox.classList.remove('cookbook-cmd-flash'), 600); } }); + fav.addEventListener('click', (e) => { + e.stopPropagation(); + const cur = _loadPresets(); + const target = _presetsForModel(cur, repo)[slotIdx]; + if (target) { + target.favorite = !target.favorite; + _savePresets(cur.map(_redactServeStateForStorage)); + uiModule.showToast(target.favorite ? 
'Favorited — pinned to top' : 'Unfavorited'); + _showSavedConfigMenu(anchor); + } + }); del.addEventListener('click', async (e) => { e.stopPropagation(); const label = p.label || `Config ${idx + 1}`; if (!await window.styledConfirm(`Delete saved config "${label}"?`, { confirmText: 'Delete', danger: true })) return; const cur = _loadPresets(); - const toRemove = _presetsForModel(cur, repo)[idx]; + const toRemove = _presetsForModel(cur, repo)[slotIdx]; if (toRemove) { const gi = cur.indexOf(toRemove); if (gi >= 0) cur.splice(gi, 1); - _savePresets(cur); + _savePresets(cur.map(_redactServeStateForStorage)); } uiModule.showToast(`Deleted "${label}"`); _updateSavedToggleLabel(); @@ -1482,6 +2317,7 @@ function _rerenderCachedModels() { } } updateCmd(); + try { _updateRecommendedCtx(false); } catch {} }); }); @@ -1489,6 +2325,65 @@ function _rerenderCachedModels() { const _probeBtn = panel.querySelector('.cookbook-gpu-probe'); const _clearBtn = panel.querySelector('.cookbook-gpu-clear'); const _splitArrow = panel.querySelector('.cookbook-gpu-split-arrow'); + const _launchMoreBtn = panel.querySelector('.hwfit-serve-schedule-arrow'); + if (_launchMoreBtn) { + _launchMoreBtn.addEventListener('click', (ev) => { + if (ev.__openScheduleDirect) return; + ev.preventDefault(); + ev.stopPropagation(); + document.querySelectorAll('.cookbook-launch-actions-menu').forEach(m => { if (typeof m._dismiss === 'function') m._dismiss(); else m.remove(); }); + const menu = document.createElement('div'); + menu.className = 'cookbook-task-dropdown cookbook-launch-actions-menu'; + let closeMenu = () => menu.remove(); + const mk = (label, cls, onClick) => { + const it = document.createElement('div'); + it.className = 'dropdown-item-compact' + (cls ? 
' ' + cls : ''); + it.style.cssText = 'display:flex;align-items:center;gap:8px;'; + it.textContent = label; + it.addEventListener('click', (e) => { + e.stopPropagation(); + closeMenu(); + if (onClick) onClick(); + }); + return it; + }; + menu.appendChild(mk('Copy launch command', '', () => { + updateCmd(); + const cmdBox = panel.querySelector('.hwfit-serve-cmd'); + const cmd = (_cmdManuallyEdited && cmdBox) + ? cmdBox.value + : _formatServeCmdPreview(panel._cmd || cmdBox?.value || ''); + _copyText(cmd).then(() => uiModule.showToast('Launch command copied')); + })); + menu.appendChild(mk('Schedule', '', () => { + const direct = new MouseEvent('click', { bubbles: true, cancelable: true }); + direct.__openScheduleDirect = true; + _launchMoreBtn.dispatchEvent(direct); + })); + menu.appendChild(mk('Clear Server', 'cookbook-dropdown-danger', () => _clearBtn?.click())); + menu.appendChild(mk('Cancel', 'dropdown-cancel-mobile', () => {})); + const r = _launchMoreBtn.getBoundingClientRect(); + menu.style.position = 'fixed'; + menu.style.right = (window.innerWidth - r.right) + 'px'; + document.body.appendChild(menu); + { + const vv = window.visualViewport; + const viewTop = vv ? vv.offsetTop : 0; + const viewBottom = vv ? vv.offsetTop + vv.height : window.innerHeight; + const mh = menu.offsetHeight; + const m = 8; + let top = r.bottom + 4; + if (top + mh > viewBottom - m) { + const above = r.top - 4 - mh; + top = above >= viewTop + m ? above : Math.max(viewTop + m, viewBottom - mh - m); + } + menu.style.top = top + 'px'; + } + const _scrollClose = () => closeMenu(); + closeMenu = bindMenuDismiss(menu, () => { menu.remove(); window.removeEventListener('scroll', _scrollClose, true); }, (e) => !menu.contains(e.target) && e.target !== _launchMoreBtn); + window.addEventListener('scroll', _scrollClose, true); + }); + } // Split-button arrow opens a small popup with the secondary action // (Probe GPUs) + a Cancel item. 
The popup re-uses the same probe // logic by programmatically clicking the hidden .cookbook-gpu-probe. @@ -1867,8 +2762,69 @@ function _rerenderCachedModels() { updateRuntimeReadinessNote(); } updateCmd(); + if (['backend', 'tp', 'gpu_mem', 'vllm_kv_cache_dtype', 'gpus'].includes(e.target.dataset.field)) { + try { _updateRecommendedCtx(false); } catch {} + } }); }); + // llama.cpp CPU/GPU/Unified mode-toggle wiring. Clicking a mode + // flips the .active classes + marker class (so the sliding + // pill matches Agent/Chat), updates the hidden data-field input, + // and fires a change event so the existing field-change handler + // rebuilds the serve cmd (sets -ngl 99 vs -ngl 0 and unified env). + panel.querySelectorAll('[data-llama-mode-toggle]').forEach(group => { + group.querySelectorAll('.mode-toggle-btn').forEach(btn => { + btn.addEventListener('click', (e) => { + e.preventDefault(); e.stopPropagation(); + const want = btn.dataset.llamaMode; + if (!want) return; + group.querySelectorAll('.mode-toggle-btn').forEach(b => { + const isActive = b.dataset.llamaMode === want; + b.classList.toggle('active', isActive); + b.setAttribute('aria-pressed', isActive ? 'true' : 'false'); + }); + group.classList.toggle('mode-right', false); + group.classList.toggle('mode-mid', want === 'gpu'); + group.classList.toggle('mode-third', want === 'unified'); + const hidden = group.parentElement.querySelector('[data-field="llama_mode"]'); + if (hidden) { + hidden.value = want; + hidden.dispatchEvent(new Event('change', { bubbles: true })); + } + const unified = group.parentElement.querySelector('[data-field="unified_mem"]'); + if (unified) { + unified.value = want === 'unified' ? '1' : ''; + unified.dispatchEvent(new Event('change', { bubbles: true })); + } + // Hide every GPU-only control (chiclets, Tensor Split, + // Split Mode, Main GPU, Flash Attn, etc.) + // in CPU mode — `-ngl 0` ignores them and showing them + // implies they matter. 
+ panel.classList.toggle('cookbook-llama-cpu-mode', want === 'cpu'); + panel.querySelectorAll('.cookbook-llama-gpu-only').forEach(el => { + el.style.display = (want === 'cpu') ? 'none' : ''; + }); + }); + }); + }); + // Apply the CPU-mode visibility on first render too, so a saved + // preset that loaded with llama_mode=cpu hides GPU controls + // immediately instead of flashing them then disappearing. + { + const _saved = panel.querySelector('[data-field="llama_mode"]')?.value || 'gpu'; + const _group = panel.querySelector('[data-llama-mode-toggle]'); + if (_group) { + _group.classList.toggle('mode-right', false); + _group.classList.toggle('mode-mid', _saved === 'gpu'); + _group.classList.toggle('mode-third', _saved === 'unified'); + } + const _unified = panel.querySelector('[data-field="unified_mem"]'); + if (_unified) _unified.value = _saved === 'unified' ? '1' : ''; + if (_saved === 'cpu') { + panel.classList.add('cookbook-llama-cpu-mode'); + panel.querySelectorAll('.cookbook-llama-gpu-only').forEach(el => { el.style.display = 'none'; }); + } + } // Themed +/- buttons next to spec_tokens — step the adjacent number input. panel.querySelectorAll('.hwfit-numstep-btn').forEach(btn => { btn.addEventListener('click', (e) => { @@ -1972,9 +2928,8 @@ function _rerenderCachedModels() { // rejects \n / \r outright (`Invalid characters in cmd`), so collapse // all whitespace to single spaces before launch — same effect as the // user manually re-flowing the textarea, no behavior change. - const _rawLaunchCmd = _cmdTextarea ? _cmdTextarea.value : panel._cmd; - const launchCmd = String(_rawLaunchCmd || '').replace(/\s+/g, ' ').trim(); - if (_cmdTextarea && _cmdTextarea.value !== launchCmd) _cmdTextarea.value = launchCmd; + const _rawLaunchCmd = (_cmdManuallyEdited && _cmdTextarea) ? 
_cmdTextarea.value : panel._cmd; + const launchCmd = _normalizeServeCmdForLaunch(_rawLaunchCmd); const serveState = {}; panel.querySelectorAll('.hwfit-sf').forEach(el => { if (el.type === 'checkbox') serveState[el.dataset.field] = el.checked; @@ -1996,9 +2951,10 @@ function _rerenderCachedModels() { try { const _runningMod = await import('./cookbookRunning.js'); const _hostStr = launchTarget.host || ''; + const _serverKeyStr = launchTarget.serverKey || (_hostStr || 'local'); const _active = (_runningMod._loadTasks ? _runningMod._loadTasks() : []).filter(t => t && t.type === 'serve' - && (t.remoteHost || '') === _hostStr + && ((t.remoteHost || '') === _hostStr || (t.remoteServerKey || '') === _serverKeyStr) && (t.status === 'running' || t.status === 'ready' || t._serveReady) ); if (_active.length) { @@ -2041,6 +2997,140 @@ function _rerenderCachedModels() { }); return; } + // llama.cpp VRAM-fit preflight. Catches the silent-CPU-fallback + // trap: when the model + KV cache exceed the selected GPUs' free + // VRAM, llama-cpp-python doesn't error — it pushes layers/KV to + // CPU and inference crawls at sub-1 tok/s. Off by default; can + // be bypassed per-launch via the dialog's "Allow CPU overflow" + // action, OR persistently by ticking the same-named checkbox. + if (serveState.backend === 'llamacpp' + && String(serveState.llama_mode || 'gpu') !== 'cpu' + && !serveState.llama_cpu_overflow) { + try { + const _ctx = Math.max(1, parseInt(serveState.ctx, 10) || 8192); + // Model size on disk — close enough for GPU footprint of a GGUF. + const _modelBytes = Number(m?.size_bytes || 0) || Math.round((Number(m?.size_gb || 0)) * 1024 * 1024 * 1024); + const _modelGb = _modelBytes / (1024 ** 3); + // KV cache heuristic. ~0.7MB / token / 7.5GB-of-model at fp16 + // KV, scaled linearly by model size. Imperfect but covers + // the common 7B–70B range within ~20% — good enough to catch + // overflow before it silently happens. + const _kvGbPerToken = _modelGb > 0 ? 
(_modelGb / 7.5) * 0.0007 : 0.0007; + const _kvGb = _ctx * _kvGbPerToken; + const _needGb = _modelGb + _kvGb; + const _selStr = (serveState.gpus || '').trim(); + const _selIdx = _selStr ? _selStr.split(',').map(s => parseInt(s.trim(), 10)).filter(n => Number.isFinite(n)) : [0]; + // Fetch FRESH GPU data per-launch — the hwfit cache may be + // stale or for a different host (e.g. user switched server + // picker without scanning), which used to silently skip the + // preflight and let the launch silently fall to CPU. + let _hwGpus = []; + try { + const _gh = (launchTarget.host || '').trim(); + const _gp = new URLSearchParams(); + if (_gh) { + _gp.set('host', _gh); + const _sp = (_serverByVal?.(launchTarget.serverKey || _gh) || {}).port; + if (_sp) _gp.set('ssh_port', _sp); + } + const _gr = await fetch('/api/cookbook/gpus' + (_gp.toString() ? '?' + _gp : ''), { credentials: 'same-origin' }); + if (_gr.ok) { + const _gd = await _gr.json(); + _hwGpus = Array.isArray(_gd) ? _gd : (_gd.gpus || []); + } + } catch {} + const _freeFor = (idx) => { + const g = _hwGpus[idx]; + const mb = g?.free_mb; + return Number.isFinite(mb) ? mb / 1024 : 0; + }; + const _selFreeGb = _selIdx.reduce((s, i) => s + _freeFor(i), 0); + // Skip the gate when we don't have any free-VRAM data (probe + // failed) — better to let the launch try than silently refuse + // on a missing data point. + if (_selFreeGb > 0 && _needGb > _selFreeGb && _modelGb > 0) { + // Suggest the smallest set of additional GPUs whose free + // VRAM closes the gap. Greedy by largest-free-first. 
+ const _candidates = _hwGpus + .map((g, i) => ({ i, free: _freeFor(i) })) + .filter(x => !_selIdx.includes(x.i) && x.free > 0) + .sort((a, b) => b.free - a.free); + const _addGpus = []; + let _runFree = _selFreeGb; + for (const c of _candidates) { + _addGpus.push(c.i); _runFree += c.free; + if (_runFree >= _needGb) break; + } + const _canAddGpu = _runFree >= _needGb && _addGpus.length > 0; + // Recommend ctx that just-fits on current selection. + const _recCtxRaw = Math.floor((_selFreeGb - _modelGb) / _kvGbPerToken); + const _recCtx = Math.max(1024, Math.floor(_recCtxRaw / 1024) * 1024); + // Custom modal — styledConfirm only takes 2 buttons; this + // surface needs up to 4 actions (Reduce / Add GPUs / Allow / Cancel). + const _action = await new Promise(resolve => { + const ov = document.createElement('div'); + ov.className = 'modal'; + ov.style.cssText = 'display:flex;align-items:center;justify-content:center;z-index:10050;position:fixed;inset:0;background:rgba(0,0,0,0.4);'; + const _btnRow = []; + if (_recCtx > 1024 && _recCtx < _ctx) { + _btnRow.push(``); + } + if (_canAddGpu) { + _btnRow.push(``); + } + _btnRow.push(``); + _btnRow.push(``); + ov.innerHTML = ''; + document.body.appendChild(ov); + ov.addEventListener('click', (e) => { + const b = e.target.closest('[data-vram-action]'); + if (b) { ov.remove(); resolve(b.dataset.vramAction); } + else if (e.target === ov) { ov.remove(); resolve('cancel'); } + }); + }); + if (_action === 'cancel' || !_action) { _restoreLaunchBtn(); return; } + if (_action === 'reduce') { + const _ctxEl = panel.querySelector('[data-field="ctx"]'); + if (_ctxEl) { + _ctxEl.value = String(_recCtx); + serveState.ctx = String(_recCtx); + _ctxEl.dispatchEvent(new Event('change', { bubbles: true })); + } + } else if (_action === 'add_gpus') { + for (const i of _addGpus) { + const _b = panel.querySelector(`.cookbook-gpu-btn[data-gpu="${i}"]`); + if (_b && !_b.classList.contains('active')) _b.click(); + } + const _gpusEl = 
panel.querySelector('[data-field="gpus"]'); + if (_gpusEl) serveState.gpus = _gpusEl.value; + } else if (_action === 'allow_cpu') { + const _ov = panel.querySelector('[data-field="llama_cpu_overflow"]'); + if (_ov) { + _ov.checked = true; + _ov.dispatchEvent(new Event('change', { bubbles: true })); + } + serveState.llama_cpu_overflow = true; + } + // After mutation, rebuild the serve cmd preview so the + // launched cmd matches what the user just chose. + try { updateCmd(); } catch {} + } + } catch (_e) { + // Preflight is best-effort — never block on its own failure. + } + } // Pre-launch GPU probe — common failure pattern: vLLM/SGLang launched // on a host where no GPU is visible (driver missing, $CUDA_VISIBLE_DEVICES // unset, container without --gpus). Catch it BEFORE the user spends @@ -2135,8 +3225,9 @@ function _rerenderCachedModels() { try { cur = JSON.parse(localStorage.getItem(SERVE_STATE_KEY)) || {}; } catch {} const byRepo = (cur && cur._byRepo && typeof cur._byRepo === 'object') ? cur._byRepo : {}; const _saved = { ...serveState, _forceBackend: true }; + delete _saved._replaceTaskId; byRepo[repo] = _saved; - localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: _saved })); + localStorage.setItem(SERVE_STATE_KEY, JSON.stringify(_redactServeStateForStorage({ _byRepo: byRepo, _lastUsed: _saved }))); } catch {} const origEnv = _envState.env; const origEnvPath = _envState.envPath; @@ -2146,19 +3237,55 @@ function _rerenderCachedModels() { // Resolve the target host from the visible Server dropdown — the reliable // source. Relying on _envState.remoteHost silently sent serves to Local // when that value was stale/empty. Pass it explicitly to the launcher. 
- let serveHost = launchTarget.host || ''; - let _srvEnv = launchTarget.env || '', _srvEnvPath = launchTarget.venv || ''; + const serveHost = launchTarget.host || ''; + const serveServerKey = launchTarget.serverKey || ''; + const serveServerName = launchTarget.serverName || ''; + const _srvEnv = launchTarget.env || ''; + const _srvEnvPath = launchTarget.venv || ''; // The venv field wins; otherwise fall back to the env configured for the // selected server in Settings, so the activation isn't silently dropped // when the field is left blank (the per-server venv wasn't being applied). if (venvVal) { _envState.env = 'venv'; _envState.envPath = venvVal; } else if (_srvEnvPath) { _envState.env = (_srvEnv === 'conda' ? 'conda' : 'venv'); _envState.envPath = _srvEnvPath; } if (gpusVal) _envState.gpus = gpusVal; + // Preflight: launching a GPU engine (llama.cpp / vLLM / SGLang) + // against the local-in-container target on a host whose hwfit + // scan reports no GPU backend. That falls through to a CPU build + // / CPU inference path and is usually NOT what the user wants — + // they typically have a host-side GPU (AMD/Vulkan, NVIDIA on a + // different box) that the container can't see. Surface this so + // the user can pick the host as a remote target instead, or + // confirm they really meant CPU. + try { + const _isLocalInContainer = !serveHost; // empty serveHost == cookbook container's local + const _wantsGpu = ['llamacpp', 'vllm', 'sglang', 'diffusers'].includes(serveState.backend); + const _detectedBackend = String(_hwfitCache?.system?.backend || '').toLowerCase(); + const _gpuBackends = ['cuda', 'rocm', 'vulkan', 'metal', 'mps', 'apple']; + if (_isLocalInContainer && _wantsGpu && _detectedBackend && !_gpuBackends.includes(_detectedBackend)) { + const _proceed = await window.styledConfirm( + `The local (in-container) target has no GPU backend detected (hwfit reports: "${_detectedBackend || 'none'}"). 
${serveState.backend.toUpperCase()} will run on CPU only and may be unusably slow.\n\nIf this machine has a GPU on the host, add the host as a server in Settings and target that instead. Otherwise launch anyway for CPU inference.`, + { + title: 'No GPU on local target', + confirmText: 'Launch anyway (CPU)', + cancelText: 'Cancel', + danger: true, + }, + ); + if (!_proceed) { + if (typeof _restoreLaunchBtn === 'function') _restoreLaunchBtn(); + _envState.env = origEnv; + _envState.envPath = origEnvPath; + _envState.gpus = origGpus; + return; + } + } + } catch { /* preflight is best-effort */ } try { await _withSpinner(_launchBtn, async () => { // Pass the exact form values so the running task can be re-opened // in the Serve panel pre-filled with these settings (Edit button). - await _launchServeTask(shortName, repo, launchCmd, serveState, serveHost); + const taskDisplayName = _serveTaskDisplayName(shortName, m, serveState); + await _launchServeTask(taskDisplayName, repo, launchCmd, serveState, serveHost, { serverKey: serveServerKey, serverName: serveServerName }); }); } finally { _envState.env = origEnv; @@ -2175,7 +3302,7 @@ function _rerenderCachedModels() { // copying collapses the whole serve panel mid-flight. e.preventDefault(); e.stopPropagation(); - const cmd = panel.querySelector('.hwfit-serve-cmd').value; + const cmd = _cmdManuallyEdited ? panel.querySelector('.hwfit-serve-cmd').value : _formatServeCmdPreview(panel._cmd); _copyText(cmd).then(() => { const btn = panel.querySelector('.hwfit-serve-copy'); const origHtml = btn.innerHTML; @@ -2219,7 +3346,6 @@ function _resolveCacheHost() { } async function _deleteCachedModel(repo, itemEl, skipConfirm = false, model = null) { - if (!skipConfirm && !(await uiModule.styledConfirm(`Delete ${repo} from cache?`, { confirmText: 'Delete', danger: true }))) return; const m = model || _cachedAllModels.find(x => x.repo_id === repo); // Delete the EXACT on-disk path the scan reported. 
Models in a custom // model dir live at /; HF-cache models at @@ -2235,13 +3361,33 @@ async function _deleteCachedModel(repo, itemEl, skipConfirm = false, model = nul } else { target = `~/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}`; } + let deleteChoice = { mode: 'repo' }; + const ggufFiles = _ggufFilesForModel(m); + if (!skipConfirm) { + if (ggufFiles.length > 1) { + deleteChoice = await _ggufDeleteChoice(repo, ggufFiles); + if (!deleteChoice) return; + } else if (!(await uiModule.styledConfirm(`Delete ${repo} from cache?`, { confirmText: 'Delete', danger: true }))) { + return; + } + } const host = _resolveCacheHost(); let cmd; if (_isWindows()) { + const _psSingleQuote = (value) => `'${String(value || '').replace(/'/g, "''")}'`; const winTarget = target.startsWith('~') ? target.replace(/^~/, '$env:USERPROFILE').replace(/\//g, '\\') : target.replace(/\//g, '\\'); - cmd = `Remove-Item -Recurse -Force "${winTarget}" -ErrorAction SilentlyContinue`; + if (deleteChoice.mode === 'files') { + const targets = deleteChoice.files + .map(f => _safeGgufRelPath(f.rel_path)) + .filter(Boolean) + .map(rel => `${winTarget}\\${rel.replace(/\//g, '\\')}`); + if (!targets.length) return; + cmd = targets.map(p => `Remove-Item -Force ${_psSingleQuote(p)} -ErrorAction SilentlyContinue`).join('; '); + } else { + cmd = `Remove-Item -Recurse -Force ${_psSingleQuote(winTarget)} -ErrorAction SilentlyContinue`; + } if (host) { const pf = _sshPrefix(_getPort(host)); cmd = `ssh ${pf}${host} "powershell -Command \\"${cmd}\\""`; @@ -2250,7 +3396,16 @@ async function _deleteCachedModel(repo, itemEl, skipConfirm = false, model = nul // $HOME expands inside double quotes; ~ would not, so normalize the // fallback. Quoting also handles spaces in custom model-dir paths. const unixTarget = target.startsWith('~') ? 
target.replace(/^~/, '$HOME') : target; - cmd = `rm -rf "${unixTarget}"`; + if (deleteChoice.mode === 'files') { + const targets = deleteChoice.files + .map(f => _safeGgufRelPath(f.rel_path)) + .filter(Boolean) + .map(rel => `${target.replace(/\/+$/, '')}/${rel}`); + if (!targets.length) return; + cmd = `rm -f ${targets.map(p => _shellPathExpr(p)).join(' ')} && find ${_shellPathExpr(target)} -type d -empty -delete`; + } else { + cmd = `rm -rf "${unixTarget}"`; + } if (host) cmd = _sshCmd(host, cmd, _getPort(host)); } // Deleting a large model (tens/hundreds of GB) can take a while, especially @@ -2275,7 +3430,13 @@ async function _deleteCachedModel(repo, itemEl, skipConfirm = false, model = nul body: JSON.stringify({ command: cmd }), }); if (!res.ok) { uiModule.showError(`Delete failed (${res.status})`); return; } - if (itemEl) { + if (deleteChoice.mode === 'files') { + if (m && Array.isArray(m.gguf_files)) { + const removed = new Set(deleteChoice.files.map(f => _safeGgufRelPath(f.rel_path))); + m.gguf_files = m.gguf_files.filter(f => !removed.has(_safeGgufRelPath(f.rel_path))); + } + await _fetchCachedModels(false); + } else if (itemEl) { itemEl.querySelector('.cookbook-delete-overlay')?.remove(); itemEl.style.transition = 'opacity 0.24s ease, transform 0.24s ease, max-height 0.28s ease, padding 0.28s ease, margin 0.28s ease'; itemEl.style.maxHeight = `${Math.max(itemEl.getBoundingClientRect().height, itemEl.scrollHeight)}px`; @@ -2289,9 +3450,9 @@ async function _deleteCachedModel(repo, itemEl, skipConfirm = false, model = nul requestAnimationFrame(() => { itemEl.style.maxHeight = '0'; }); await new Promise(resolve => setTimeout(resolve, 300)); if (itemEl.parentElement) itemEl.remove(); + // Drop from the in-memory list so a re-render/filter doesn't resurrect it. + _cachedAllModels = _cachedAllModels.filter(x => x.repo_id !== repo); } - // Drop from the in-memory list so a re-render/filter doesn't resurrect it. 
- _cachedAllModels = _cachedAllModels.filter(x => x.repo_id !== repo); } catch (e) { uiModule.showError('Delete failed: ' + (e && e.message ? e.message : e)); } finally { @@ -2352,7 +3513,7 @@ export async function openServePanelForRepo(repo, fields) { // overridable defaults. const _seeded = { ...fields, _forceBackend: true }; byRepo[repo] = _seeded; - localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: _seeded })); + localStorage.setItem(SERVE_STATE_KEY, JSON.stringify(_redactServeStateForStorage({ _byRepo: byRepo, _lastUsed: _seeded }))); } catch {} } // Switch to the Serve tab (its click handler triggers _fetchCachedModels). @@ -2580,6 +3741,7 @@ export function initServe(shared) { _getPort = shared._getPort; _sshPrefix = shared._sshPrefix; _serverByVal = shared._serverByVal; + _serverKey = shared._serverKey; _getPlatform = shared._getPlatform; _isWindows = shared._isWindows; _isMetal = shared._isMetal; diff --git a/static/js/document.js b/static/js/document.js index b314c0589..3fb825656 100644 --- a/static/js/document.js +++ b/static/js/document.js @@ -24,6 +24,7 @@ import * as Modals from './modalManager.js'; let _autoDetectDebounce = null; let _autoTitleDebounce = null; let _autoSaveDebounce = null; + let _lastAutoSaveErrorAt = 0; let _animationInProgress = false; let _animationCancel = null; // function to cancel current animation let _htmlPreviewActive = false; // true when inline HTML preview iframe is showing @@ -154,6 +155,20 @@ import * as Modals from './modalManager.js'; addDocToTabs, syncDocIndicator: _syncDocIndicator, }); + const sidebarNewDocBtn = document.getElementById('library-new-doc-btn'); + if (sidebarNewDocBtn && !sidebarNewDocBtn.dataset.docNewWired) { + sidebarNewDocBtn.dataset.docNewWired = '1'; + sidebarNewDocBtn.addEventListener('click', async (e) => { + e.preventDefault(); + e.stopPropagation(); + try { + await newDocument(); + } catch (err) { + console.error('Failed to create document from sidebar 
button:', err); + if (uiModule) uiModule.showError('Failed to create document'); + } + }); + } _maybeOpenDocFromHash(); window.addEventListener('hashchange', _maybeOpenDocFromHash); } @@ -2686,6 +2701,104 @@ import * as Modals from './modalManager.js'; await _uploadComposeFiles(files); } + function _isMarkdownImageFile(file) { + if (!file) return false; + if ((file.type || '').toLowerCase().startsWith('image/')) return true; + return /\.(avif|bmp|gif|jpe?g|png|svg|webp)$/i.test(file.name || ''); + } + + function _markdownImageAlt(name) { + const base = String(name || 'image').replace(/\.[^.]+$/, '').trim() || 'image'; + return base.replace(/[\[\]\n\r]/g, ' ').replace(/\s+/g, ' ').trim() || 'image'; + } + + function _activeDocLanguage() { + const doc = activeDocId && docs.get(activeDocId); + return ((doc && doc.language) || document.getElementById('doc-language-select')?.value || '').toLowerCase(); + } + + function _scheduleMarkdownImageAutosave(ta) { + updateLineNumbers(ta.value); + const codeEl = document.getElementById('doc-editor-code'); + if (codeEl && !codeEl.dataset.hasDiff) { + codeEl.textContent = ta.value + '\n'; + codeEl.style.minHeight = ta.scrollHeight + 'px'; + } + clearTimeout(_hlDebounce); + _hlDebounce = setTimeout(syncHighlighting, 80); + clearTimeout(_autoTitleDebounce); + _autoTitleDebounce = setTimeout(() => autoTitleFromContent(ta.value), 600); + clearTimeout(_autoSaveDebounce); + _autoSaveDebounce = setTimeout(() => { saveDocument({ silent: true }); }, 800); + } + + function _insertMarkdownImages(uploadedFiles) { + const ta = document.getElementById('doc-editor-textarea'); + if (!ta) return; + const files = Array.isArray(uploadedFiles) ? 
uploadedFiles : []; + if (!files.length) return; + + const start = ta.selectionStart || 0; + const end = ta.selectionEnd || start; + const before = ta.value.slice(0, start); + const after = ta.value.slice(end); + const lines = files.map(file => { + const id = encodeURIComponent(file.id || file.file_id || ''); + const alt = _markdownImageAlt(file.name || file.filename); + return id ? `![${alt}](/api/upload/${id})` : ''; + }).filter(Boolean); + if (!lines.length) return; + + const prefix = before && !before.endsWith('\n') ? '\n' : ''; + const suffix = after && !after.startsWith('\n') ? '\n' : ''; + const insert = `${prefix}${lines.join('\n\n')}${suffix}`; + _replaceRange(ta, start, end, insert); + const caret = start + insert.length; + ta.selectionStart = caret; + ta.selectionEnd = caret; + ta.focus(); + _scheduleMarkdownImageAutosave(ta); + _refreshMarkdownPreviewIfVisible(activeDocId, ta.value); + } + + async function _uploadMarkdownImages(files) { + const images = Array.from(files || []).filter(_isMarkdownImageFile); + if (!images.length) { + if (uiModule) uiModule.showError('Choose an image file'); + return; + } + if (_activeDocLanguage() !== 'markdown') { + if (uiModule) uiModule.showError('Switch the document to markdown before inserting images'); + return; + } + + const fd = new FormData(); + images.forEach(file => fd.append('files', file)); + try { + const res = await fetch(`${API_BASE}/api/upload`, { + method: 'POST', + credentials: 'same-origin', + body: fd, + }); + let data = null; + try { data = await res.json(); } catch (_) {} + if (!res.ok) throw new Error((data && (data.error || data.detail)) || `HTTP ${res.status}`); + const uploaded = Array.isArray(data?.files) ? data.files : []; + if (!uploaded.length) throw new Error('No uploaded files returned'); + _insertMarkdownImages(uploaded); + if (uiModule) uiModule.showToast(images.length === 1 ? 
'Image inserted' : 'Images inserted'); + } catch (err) { + console.error('Failed to insert markdown image:', err); + if (uiModule) uiModule.showError('Failed to insert image'); + } + } + + async function _handleMarkdownImageUpload(e) { + const files = e.target.files; + e.target.value = ''; + await _uploadMarkdownImages(files); + } + function _renderComposeAttachments() { const container = document.getElementById('doc-email-compose-atts'); if (!container) return; @@ -3752,9 +3865,12 @@ import * as Modals from './modalManager.js'; const res = await fetch(`${API_BASE}/api/document`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, + credentials: 'same-origin', body: JSON.stringify({ session_id: sessionId, title: '', content }), }); + if (!res.ok) throw new Error(`Document create failed: HTTP ${res.status}`); const doc = await res.json(); + if (!doc || !doc.id) throw new Error('Document create failed: missing id'); addDocToTabs(doc, sessionId); // Set the content into the map so switchToDoc preserves it const d = docs.get(doc.id); @@ -3981,6 +4097,7 @@ import * as Modals from './modalManager.js';
+