Merge remote-tracking branch 'origin/main' into visual-pr-playground

# Conflicts: # routes/cookbook_routes.py # routes/hwfit_routes.py # services/hwfit/fit.py # services/hwfit/models.py # static/js/cookbook-diagnosis.js # static/js/cookbook-hwfit.js # static/js/cookbook.js # static/js/cookbookRunning.js
2026-06-30 00:22:10 -04:00 · 2026-06-03 16:49:10 +09:00
parent eb79b76432 41a928f21b
commit 3706d756f3
569 changed files with 35252 additions and 3489 deletions
@@ -37,7 +37,8 @@ from routes.cookbook_helpers import (
    _validate_local_dir, _validate_ssh_port, _validate_gpus, _shell_path,
    _ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
    _safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
-    _append_serve_exit_code_lines, _cached_model_scan_script,
+    _append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
+    _ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache, _venv_safe_local_pip_install_cmd,
    ModelDownloadRequest, ServeRequest,
 )

@@ -148,6 +149,15 @@ def setup_cookbook_routes() -> APIRouter:
                "No GPUs are visible to the serve process.",
                [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
            ),
+            (
+                r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
+                "vLLM could not find a supported GPU (CUDA or ROCm). "
+                "This machine may have integrated or unsupported graphics only.",
+                [
+                    {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+                    {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+                ],
+            ),
            (
                r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
                "vLLM is not installed or not in PATH on this server.",
@@ -163,6 +173,11 @@ def setup_cookbook_routes() -> APIRouter:
                "llama.cpp / llama-cpp-python dependencies are missing.",
                [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
            ),
+            (
+                r"No GGUF found on this host|no \.gguf file|No GGUF file found",
+                "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
+                [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
+            ),
            (
                r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
                "Diffusion serving requires PyTorch and diffusers.",
@@ -368,11 +383,15 @@ def setup_cookbook_routes() -> APIRouter:
                encoding="utf-8",
            )
            argv = [os.environ.get("ComSpec", "cmd.exe"), "/c", str(script_path)]
+        env = os.environ.copy()
+        env["PYTHONUTF8"] = "1"
+        env["PYTHONIOENCODING"] = "utf-8"
        proc = subprocess.Popen(
            argv,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            stdin=subprocess.DEVNULL,
+            env=env,
            **detached_popen_kwargs(),
        )
        pid_path.write_text(str(proc.pid), encoding="utf-8")
@@ -432,12 +451,12 @@ def setup_cookbook_routes() -> APIRouter:
        # throughput. Retries set disable_hf_transfer to fall back to the plain,
        # slower-but-reliable downloader (resumes cleanly from the .incomplete files).
        # Use `python3 -m pip` not `pip` — macOS has no bare `pip` command.
-        lines.append("command -v hf >/dev/null 2>&1 || python3 -m pip install --user --break-system-packages -q -U huggingface_hub 2>/dev/null || python3 -m pip install -q -U huggingface_hub 2>/dev/null")
+        lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
        if req.disable_hf_transfer:
            lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
            lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
        else:
-            lines.append("python3 -c 'import hf_transfer' 2>/dev/null || python3 -m pip install --user --break-system-packages -q hf_transfer 2>/dev/null || python3 -m pip install -q hf_transfer 2>/dev/null")
+            lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
            lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
            lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")

@@ -531,12 +550,18 @@ def setup_cookbook_routes() -> APIRouter:
                )
            # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
            runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
-            # Install hf CLI + hf_transfer best-effort so future runs get the fast path.
+            # Install hf CLI + optional hf_transfer best-effort. Retries disable
+            # hf_transfer because the Rust parallel path is fast but has been
+            # flaky near the end of very large multi-file downloads.
            # Use --break-system-packages on PEP-668 systems (Arch, newer Debian) so it doesn't bail.
-            runner_lines.append("command -v hf >/dev/null 2>&1 || pip install --user --break-system-packages -q -U huggingface_hub 2>/dev/null || pip install -q -U huggingface_hub 2>/dev/null")
-            runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null || pip install --user --break-system-packages -q hf_transfer 2>/dev/null || pip install -q hf_transfer 2>/dev/null")
-            runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
-            runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
+            runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
+            if req.disable_hf_transfer:
+                runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
+                runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+            else:
+                runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
+                runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+                runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
            # Surface whether the HF token actually reached THIS server, so a gated
            # download's "not authorized" failure can be told apart from a missing
            # token (the token is masked — we only print applied / not-set).
@@ -547,15 +572,19 @@ def setup_cookbook_routes() -> APIRouter:
            runner_lines.append(f'  {hf_cmd} < /dev/null')
            runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then')
            runner_lines.append('  echo "hf CLI not found, using Python huggingface_hub..."')
-            runner_lines.append(f'  python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers=8)"')
+            runner_lines.append(f'  python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
            runner_lines.append('else')
            runner_lines.append('  echo "Installing huggingface-hub and dependencies..."')
            runner_lines.append('  pip install --no-deps -q huggingface-hub 2>/dev/null')
-            runner_lines.append('  pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
-            runner_lines.append("  python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
-            runner_lines.append(f'  python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers=8)"')
+            if req.disable_hf_transfer:
+                runner_lines.append('  pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
+                runner_lines.append('  export HF_HUB_ENABLE_HF_TRANSFER=0')
+            else:
+                runner_lines.append('  pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
+                runner_lines.append("  python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+            runner_lines.append(f'  python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
            runner_lines.append('fi')
-            runner_lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
+            runner_lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
            runner_lines.append(f"rm -f {remote_runner}")
            runner_lines.append('exec "${SHELL:-/bin/bash}"')
            runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
@@ -586,11 +615,11 @@ def setup_cookbook_routes() -> APIRouter:
                # Detached path: no controlling TTY, so skip `< /dev/null`
                # (handled by Popen stdin=DEVNULL) and don't keep a shell open.
                lines.append(hf_cmd)
-                lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
+                lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
            else:
                # < /dev/null suppresses interactive "update available? [Y/n]" prompt
                lines.append(f"{hf_cmd} < /dev/null")
-                lines.append('if [ $? -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $?)"; fi')
+                lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
                lines.append(f"rm -f '{wrapper_script}'")
                lines.append('exec "${SHELL:-/bin/bash}"')
                wrapper_script.write_text("\n".join(lines) + "\n", encoding="utf-8")
@@ -672,11 +701,14 @@ def setup_cookbook_routes() -> APIRouter:
                cwd=str(Path.home()),
            )
        else:
-            # LOCAL scan: run the interpreter directly. `python3` isn't a thing on
-            # Windows (it's `python`/`py`), and shell single-quoting of the path
-            # doesn't survive cmd.exe — so resolve the interpreter and exec it
-            # with the script path as an argv element (no shell quoting needed).
-            local_py = (
+            # LOCAL scan: use sys.executable (the venv Python Odysseus is already
+            # running under) — it's guaranteed real Python on all platforms.
+            # Falling back to which_tool on Windows risks hitting the Microsoft
+            # Store stub alias for "python3"/"python", which prints
+            # "Python was not found; run without arguments to install from the
+            # Microsoft Store" and exits 9009, producing empty stdout and a
+            # JSON parse error. sys.executable bypasses PATH entirely.
+            local_py = sys.executable or (
                which_tool("python3") or which_tool("python")
                or which_tool("py") or "python"
            )
@@ -714,6 +746,8 @@ def setup_cookbook_routes() -> APIRouter:
                    entry["backend"] = m.get("backend")
                if m.get("is_ollama"):
                    entry["is_ollama"] = True
+                if isinstance(m.get("gguf_files"), list):
+                    entry["gguf_files"] = m["gguf_files"]
                models.append(entry)
        except Exception as e:
            logger.warning(f"Failed to parse cached models: {e}")
@@ -775,6 +809,80 @@ def setup_cookbook_routes() -> APIRouter:
        finally:
            db.close()

+    def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None:
+        """Register a freshly-served LLM as a model endpoint so it appears in the
+        model picker without a manual /setup step — the text-model sibling of
+        _auto_register_image_endpoint.
+
+        Cookbook serve commands launch an OpenAI-compatible server (llama.cpp's
+        llama-server, vLLM, SGLang, or Ollama) on a known port. We point an
+        endpoint at that server's /v1; the picker auto-discovers the model id by
+        probing /v1/models and dims the endpoint until the server is reachable,
+        so registering immediately (before the server finishes loading) is safe.
+        """
+        import re
+        from core.database import SessionLocal, ModelEndpoint
+
+        # Port: an explicit --port wins. Otherwise fall back by backend — Ollama
+        # is the only server in our generated commands that omits --port.
+        port_match = re.search(r'--port\s+(\d+)', req.cmd)
+        if port_match:
+            port = int(port_match.group(1))
+        elif "ollama" in req.cmd:
+            port = 11434
+        else:
+            port = 8080  # llama.cpp's llama-server default — the Apple Silicon path
+
+        # Determine host (mirrors the image path: SSH alias for remote serves).
+        if remote:
+            host = remote.split("@")[-1] if "@" in remote else remote
+        else:
+            host = "localhost"
+
+        base_url = f"http://{host}:{port}/v1"
+
+        short_name = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
+        display_name = short_name or "Local model"
+
+        # If the serve command opts models into OpenAI tool-calling, record it so
+        # agent_loop trusts emitted tool_calls instead of the name heuristic.
+        supports_tools = True if "--enable-auto-tool-choice" in req.cmd else None
+
+        db = SessionLocal()
+        try:
+            # Reuse an endpoint already pointed at this URL instead of duplicating.
+            existing = db.query(ModelEndpoint).filter(ModelEndpoint.base_url == base_url).first()
+            if existing:
+                existing.is_enabled = True
+                existing.model_type = "llm"
+                existing.name = display_name
+                if supports_tools is not None:
+                    existing.supports_tools = supports_tools
+                db.commit()
+                logger.info(f"Updated existing local model endpoint: {base_url}")
+                return existing.id
+
+            ep_id = f"local-{uuid.uuid4().hex[:8]}"
+            ep = ModelEndpoint(
+                id=ep_id,
+                name=display_name,
+                base_url=base_url,
+                api_key=None,
+                is_enabled=True,
+                model_type="llm",
+                supports_tools=supports_tools,
+            )
+            db.add(ep)
+            db.commit()
+            logger.info(f"Auto-registered local model endpoint: {display_name} @ {base_url}")
+            return ep_id
+        except Exception as e:
+            logger.error(f"Failed to auto-register local model endpoint: {e}")
+            db.rollback()
+            return None
+        finally:
+            db.close()
+
    @router.post("/api/model/serve")
    async def model_serve(request: Request, req: ServeRequest):
        """Launch a model server in a tmux session (or PowerShell background process on Windows).
@@ -800,8 +908,17 @@ def setup_cookbook_routes() -> APIRouter:
        # many downstream `"engine" in req.cmd` membership checks can't hit
        # `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
        req.cmd = _validate_serve_cmd(req.cmd) or ""
+        req.cmd = _venv_safe_local_pip_install_cmd(
+            req.cmd,
+            local=not bool(req.remote_host),
+            in_venv=sys.prefix != sys.base_prefix,
+        )
        is_pip_install = bool(req.cmd and "pip install" in req.cmd)
        if is_pip_install:
+            # Keep big dependency wheel builds (vLLM, …) off the home filesystem's
+            # pip cache so they don't fail mid-build with "No space left" (#1219)
+            # and leave the dep installed-but-unusable (#1459).
+            req.cmd = _pip_install_no_cache(req.cmd)
            # PEP-508-style package spec — letters, digits, `.-_` for the
            # name; `[` `]` for extras; `<>=!~,` for version specifiers.
            # v2 review HIGH-14: tightened from the previous regex which
@@ -922,7 +1039,7 @@ def setup_cookbook_routes() -> APIRouter:
                runner_lines.append('  if ! python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    pkg install -y cmake 2>/dev/null')
                runner_lines.append('    pip install numpy diskcache jinja2 2>/dev/null')
-                runner_lines.append('    CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install llama-cpp-python --no-build-isolation --no-cache-dir 2>&1 || true')
+                runner_lines.append('    CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
                runner_lines.append('  fi')
                runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
                runner_lines.append('  echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
@@ -944,61 +1061,45 @@ def setup_cookbook_routes() -> APIRouter:
                runner_lines.append('      && cmake --build build -j"$NPROC" --target llama-server \\')
                runner_lines.append('      && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
                runner_lines.append('  else')
-                # Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put
-                # it on PATH so cmake's CUDA configure can find it.  We check the
-                # same three layouts as entrypoint.sh:
-                #   nvidia/cu13       — nvidia-nvcc-cu13
-                #   nvidia/cu12       — nvidia-nvcc-cu12
-                #   nvidia/cuda_nvcc  — nvidia-cuda-nvcc-cu12 (sub-package style)
-                runner_lines.append('    for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
-                runner_lines.append('      [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
-                runner_lines.append('    done')
-                # rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a
-                # failed CUDA attempt) doesn't cause the next configure to reuse
-                # stale settings and silently produce a CPU-only binary.
-                runner_lines.append('    cd ~/llama.cpp && rm -rf build')
-                runner_lines.append('    if command -v nvcc &>/dev/null; then')
-                runner_lines.append('      echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
-                runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
-                runner_lines.append('        && cmake --build build -j"$NPROC" --target llama-server \\')
-                runner_lines.append('        && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
-                runner_lines.append('    else')
-                runner_lines.append('      echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
-                runner_lines.append('      echo "[odysseus]   GPU inference will not be available for this llama.cpp build."')
-                runner_lines.append('      echo "[odysseus]   To get a GPU build, first install vLLM via Cookbook -> Dependencies"')
-                runner_lines.append('      echo "[odysseus]   (its CUDA wheels include nvcc), then re-launch this serve task."')
-                runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
-                runner_lines.append('        && cmake --build build -j"$NPROC" --target llama-server \\')
-                runner_lines.append('        && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
-                runner_lines.append('    fi')
+                _append_llama_cpp_linux_accel_build_lines(runner_lines)
                runner_lines.append('  fi')
                runner_lines.append('  # If the native build failed, fall back to the Python bindings.')
                runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    echo "llama-server build failed — installing Python bindings as fallback..."')
-                runner_lines.append('    pip install --user --break-system-packages -q llama-cpp-python 2>/dev/null || pip install -q llama-cpp-python 2>/dev/null || true')
+                runner_lines.append(f"    {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
+                runner_lines.append('  fi')
+                runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
+                runner_lines.append('    echo "ERROR: llama.cpp serving is not available after install/build attempts."')
+                runner_lines.append('    ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('  fi')
                runner_lines.append('fi')
            elif "ollama" in req.cmd:
                handled_ollama_serve = True
-                _ollama_port = "11434"
-                _ollama_match = re.search(r"OLLAMA_HOST=[^\s:]+:(\d+)", req.cmd)
-                if _ollama_match:
-                    _ollama_port = _ollama_match.group(1)
+                _ollama_default_host = "0.0.0.0" if remote else "127.0.0.1"
+                _ollama_host, _ollama_port = _ollama_bind_from_cmd(
+                    req.cmd,
+                    default_host=_ollama_default_host,
+                )
                # Ollama can be a host binary, a system service, or a Docker
                # container. If the HTTP API is already reachable, the model is
                # already served and we should not require a host `ollama` CLI.
+                runner_lines.append(f'ODYSSEUS_OLLAMA_HOST={_bash_squote(_ollama_host)}')
                runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"')
                runner_lines.append('ODYSSEUS_OLLAMA_URL=""')
-                runner_lines.append('for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
-                runner_lines.append('  [ -z "$_ody_ollama_port" ] && continue')
-                runner_lines.append('  for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
-                runner_lines.append('    _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
-                runner_lines.append('    if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
-                runner_lines.append('      ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
-                runner_lines.append('      ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
-                runner_lines.append('      break 2')
-                runner_lines.append('    fi')
+                runner_lines.append('for _ody_ollama_try in $(seq 1 20); do')
+                runner_lines.append('  for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
+                runner_lines.append('    [ -z "$_ody_ollama_port" ] && continue')
+                runner_lines.append('    for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
+                runner_lines.append('      _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
+                runner_lines.append('      if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
+                runner_lines.append('        ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
+                runner_lines.append('        ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
+                runner_lines.append('        break 3')
+                runner_lines.append('      fi')
+                runner_lines.append('    done')
                runner_lines.append('  done')
+                runner_lines.append('  [ "$_ody_ollama_try" -eq 1 ] && echo "[odysseus] Waiting for an existing Ollama API on ports ${ODYSSEUS_OLLAMA_PORT}/11434..."')
+                runner_lines.append('  sleep 1')
                runner_lines.append('done')
                runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then')
                runner_lines.append('  if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then')
@@ -1015,8 +1116,12 @@ def setup_cookbook_routes() -> APIRouter:
                runner_lines.append('  echo "=== Process exited with code 127 ==="')
                runner_lines.append('  exec bash -i')
                runner_lines.append('fi')
-                runner_lines.append('echo "Starting ollama server on 0.0.0.0:${ODYSSEUS_OLLAMA_PORT}..."')
-                runner_lines.append('OLLAMA_HOST="0.0.0.0:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
+                runner_lines.append('ODYSSEUS_OLLAMA_URL="http://${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}"')
+                if remote and _ollama_host in ("0.0.0.0", "::"):
+                    runner_lines.append('echo "[odysseus] WARNING: remote Ollama will bind to ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT} so Odysseus can reach it from this host."')
+                    runner_lines.append('echo "[odysseus] Ollama has no built-in authentication; expose this only on a trusted LAN/VPN or provide an explicit OLLAMA_HOST with your own access controls."')
+                runner_lines.append('echo "Starting ollama server on ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}..."')
+                runner_lines.append('OLLAMA_HOST="${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
                runner_lines.append('_ody_exit=$?')
                runner_lines.append('echo')
                runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
@@ -1032,19 +1137,24 @@ def setup_cookbook_routes() -> APIRouter:
                # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
                runner_lines.append('if ! command -v vllm &>/dev/null; then')
-                runner_lines.append('  echo "ERROR: vLLM is not installed. Open Cookbook -> Dependencies and install vllm on this server, then launch again."')
+                runner_lines.append('  echo "ERROR: vLLM is not installed."')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('fi')
            elif "sglang.launch_server" in req.cmd:
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
-                runner_lines.append('if ! python3 -c "import sglang" 2>/dev/null; then')
-                runner_lines.append('  echo "ERROR: SGLang is not installed. Open Cookbook -> Dependencies and install sglang on this server, then launch again."')
+                runner_lines.append('if ! command -v sglang &>/dev/null; then')
+                runner_lines.append('  echo "ERROR: SGLang is not installed."')
+                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
+                runner_lines.append('elif ! ODYSSEUS_SGLANG_IMPORT_ERROR="$(python3 -c "import sglang" 2>&1)"; then')
+                runner_lines.append('  echo "ERROR: SGLang is installed but failed to import."')
+                runner_lines.append('  printf "%s\\n" "$ODYSSEUS_SGLANG_IMPORT_ERROR"')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('fi')
            elif "scripts/diffusion_server.py" in req.cmd or ".diffusion_server.py" in req.cmd:
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
-                runner_lines.append('if ! python3 -c "import torch, diffusers" 2>/dev/null; then')
-                runner_lines.append('  echo "ERROR: Diffusion serving requires PyTorch + diffusers. Open Cookbook -> Dependencies and install diffusers on this server, then launch again."')
+                runner_lines.append('if ! ODYSSEUS_DIFFUSION_IMPORT_ERROR="$(python3 -c "import torch, diffusers" 2>&1)"; then')
+                runner_lines.append('  echo "ERROR: Diffusion serving requires PyTorch + diffusers."')
+                runner_lines.append('  printf "%s\\n" "$ODYSSEUS_DIFFUSION_IMPORT_ERROR"')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('fi')

@@ -1116,11 +1226,16 @@ def setup_cookbook_routes() -> APIRouter:
                stderr = (await proc.stderr.read()).decode(errors="replace")
                return {"ok": False, "error": stderr, "session_id": session_id}

-        # Auto-register as model endpoint if serving a diffusion model
+        # Auto-register a model endpoint so the served model shows up in the model
+        # picker with no manual /setup step. Diffusion models get an image
+        # endpoint; any other real model serve (i.e. not a pip-install task) gets
+        # a local LLM endpoint pointed at its /v1.
        endpoint_id = None
        is_diffusion = "diffusion_server.py" in req.cmd
        if is_diffusion:
            endpoint_id = _auto_register_image_endpoint(req, remote)
+        elif not is_pip_install:
+            endpoint_id = _auto_register_llm_endpoint(req, remote)

        # Log to assistant
        try:
@@ -1357,9 +1472,16 @@ def setup_cookbook_routes() -> APIRouter:
            total_mb = max(0, int(total_bytes / (1024 * 1024)))
            used_mb = max(0, min(total_mb, int(used_bytes / (1024 * 1024))))
            free_mb = max(0, total_mb - used_mb)
+            # GTT = the system-RAM pool the GPU pages into when VRAM is full.
+            # On a discrete card a large gtt_used means the model spilled past
+            # VRAM into RAM over PCIe — much slower. Surface it so the UI can
+            # warn "spilling to RAM" instead of the user wondering why it's slow.
+            gtt_used_raw = await _gpu_read_file(f"{base}/mem_info_gtt_used", host, ssh_port)
+            gtt_used_mb = max(0, int(int(gtt_used_raw) / (1024 * 1024))) if (gtt_used_raw and gtt_used_raw.isdigit()) else 0
            gpus.append({
                "index": len(gpus), "name": name, "uuid": entry,
                "free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb,
+                "gtt_used_mb": gtt_used_mb,
                "util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85),
                "processes": [], "backend": "rocm", "source": "amd-sysfs",
                "unified_memory": unified,
@@ -1461,6 +1583,46 @@ def setup_cookbook_routes() -> APIRouter:
        if gpus:
            return {"ok": True, "gpus": gpus, "backend": "cuda", "source": "nvidia-smi"}

+        # Local Apple Silicon / Metal fallback. macOS has no nvidia-smi and no
+        # Linux /sys/class/drm tree, but services.hwfit.hardware already knows
+        # how to size the shared unified-memory GPU budget. Keep this route in
+        # sync so Cookbook's GPU picker doesn't show "nvidia-smi not found" on
+        # native Mac launches.
+        if not host and sys.platform == "darwin":
+            try:
+                from services.hwfit.hardware import detect_system
+                info = detect_system(fresh=True)
+                backend = str(info.get("backend") or "").lower()
+                if backend in {"metal", "mps", "apple"} and info.get("gpu_count", 0) > 0:
+                    total_mb = int(float(info.get("gpu_vram_gb") or info.get("total_ram_gb") or 0) * 1024)
+                    free_mb = int(float(info.get("available_ram_gb") or 0) * 1024)
+                    if total_mb and (free_mb <= 0 or free_mb > total_mb):
+                        free_mb = total_mb
+                    used_mb = max(0, total_mb - max(0, free_mb))
+                    return {
+                        "ok": True,
+                        "gpus": [{
+                            "index": 0,
+                            "name": info.get("gpu_name") or info.get("cpu_name") or "Apple Silicon GPU",
+                            "uuid": "apple-metal-0",
+                            "free_mb": max(0, free_mb),
+                            "total_mb": max(0, total_mb),
+                            "used_mb": used_mb,
+                            "util_pct": 0,
+                            "busy": bool(total_mb and (free_mb / total_mb) < 0.5),
+                            "processes": [],
+                            "backend": "metal",
+                            "source": "apple-metal",
+                            "unified_memory": True,
+                        }],
+                        "backend": "metal",
+                        "source": "apple-metal",
+                        "fallback_from": "nvidia-smi",
+                        "nvidia_error": nvidia_error,
+                    }
+            except Exception as e:
+                logger.warning("Apple Metal GPU fallback failed: %s", e)
+
        amd_gpus = await _probe_amd_sysfs(host, ssh_port)
        if amd_gpus:
            return {
@@ -1607,6 +1769,33 @@ def setup_cookbook_routes() -> APIRouter:

            disk_tasks = on_disk.get("tasks") or [] if isinstance(on_disk, dict) else []
            incoming_tasks = data.get("tasks") if isinstance(data.get("tasks"), list) else []
+            # Anti-poisoning guard: a stale browser tab can keep POSTing a
+            # download task as status='done' from before the strict-finish
+            # fix landed, undoing any server-side correction. For each
+            # incoming "done" download, override to "running" if the last
+            # shard pattern says N<total AND no DOWNLOAD_OK/DOWNLOAD_FAILED/
+            # /snapshots/ sentinel is in the output.
+            import re as _re_dl
+            for _it in incoming_tasks:
+                if (not isinstance(_it, dict)) or _it.get("type") != "download" or _it.get("status") != "done":
+                    continue
+                _out = _it.get("output") or ""
+                if ("DOWNLOAD_OK" in _out) or ("DOWNLOAD_FAILED" in _out) or ("/snapshots/" in _out):
+                    continue
+                _shards = _re_dl.findall(r"model-(\d+)-of-(\d+)\.safetensors", _out)
+                if _shards:
+                    _n, _tot = _shards[-1]
+                    if int(_n) < int(_tot):
+                        logger.info(f"cookbook state POST: rejecting stale done for {_it.get('sessionId')} "
+                                    f"(last shard {_n}/{_tot}, no DOWNLOAD_OK)")
+                        _it["status"] = "running"
+                else:
+                    _completed = _out.count("Download complete")
+                    _starts = _out.count("Downloading '")
+                    if _starts > _completed:
+                        logger.info(f"cookbook state POST: rejecting stale done for {_it.get('sessionId')} "
+                                    f"({_completed}/{_starts} files complete, no DOWNLOAD_OK)")
+                        _it["status"] = "running"
            incoming_ids = {t.get("sessionId") for t in incoming_tasks if isinstance(t, dict) and t.get("sessionId")}
            import time as _t
            now_ms = int(_t.time() * 1000)
@@ -1763,6 +1952,43 @@ def setup_cookbook_routes() -> APIRouter:
    def _cookbook_tasks_status_sync():
        import subprocess

+        def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
+            """Best-effort check for a completed HF cache entry.
+
+            tmux output can stop at a stale progress line if the pane/session
+            disappears before Cookbook captures the final DOWNLOAD_OK marker.
+            In that case, trust the cache shape: a snapshot directory with files
+            and no *.incomplete blobs means HuggingFace finished materializing the
+            model.
+            """
+            if not repo_id or "/" not in repo_id:
+                return False
+            py = (
+                "import os,sys;"
+                "repo=sys.argv[1];"
+                "base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
+                "d=os.path.join(base,'models--'+repo.replace('/','--'));"
+                "snap=os.path.join(d,'snapshots');"
+                "ok=os.path.isdir(snap) and any(os.path.isdir(os.path.join(snap,x)) and os.listdir(os.path.join(snap,x)) for x in os.listdir(snap));"
+                "inc=False;"
+                "blobs=os.path.join(d,'blobs');"
+                "inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
+                "sys.exit(0 if ok and not inc else 1)"
+            )
+            cmd = ["python3", "-c", py, repo_id]
+            try:
+                if remote_host:
+                    ssh_base = ["ssh"]
+                    if ssh_port and ssh_port != "22":
+                        ssh_base.extend(["-p", str(ssh_port)])
+                    shell_cmd = " ".join(shlex.quote(x) for x in cmd)
+                    proc = subprocess.run(ssh_base + [remote_host, shell_cmd], timeout=12, capture_output=True)
+                else:
+                    proc = subprocess.run(cmd, timeout=12, capture_output=True)
+                return proc.returncode == 0
+            except Exception:
+                return False
+
        # Load saved tasks from cookbook state
        tasks = []
        if _cookbook_state_path.exists():
@@ -1902,14 +2128,21 @@ def setup_cookbook_routes() -> APIRouter:
            # persists after the process exits, so a finished download still has a
            # snapshot to classify (DOWNLOAD_OK / exit marker) — evaluate it even
            # when the PID is gone instead of blindly reporting "stopped".
+            download_zero_files = False
            status = "unknown"
            if is_alive or (local_win_task and full_snapshot):
                lower = full_snapshot.lower()
-                has_exit = "=== process exited with code" in lower
+                exit_match = re.search(r"=== process exited with code\s+(-?\d+)", full_snapshot, re.I)
+                has_exit = exit_match is not None
+                exit_code = int(exit_match.group(1)) if exit_match else None
                has_error = "error" in lower or "failed" in lower or "traceback" in lower
                if has_exit and task_type == "serve":
                    # Serve tasks that exit are always errors — they should run indefinitely
                    status = "error"
+                elif has_exit and task_type == "download":
+                    # Dependency installs are tracked as download tasks but only
+                    # emit the generic runner exit marker, not HF download markers.
+                    status = "completed" if exit_code == 0 else "error"
                elif has_exit and "unrecognized arguments" in lower:
                    status = "error"
                elif has_error and not ("application startup complete" in lower):
@@ -1918,7 +2151,11 @@ def setup_cookbook_routes() -> APIRouter:
                    # Only download tasks treat 100% as "completed".
                    # Serve tasks log 100%|██████| during inference progress
                    # (diffusion sampling, etc.) — that's "running", not done.
-                    status = "completed"
+                    if re.search(r"Fetching\s+0\s+files", full_snapshot, re.IGNORECASE):
+                        status = "error"
+                        download_zero_files = True
+                    else:
+                        status = "completed"
                elif "application startup complete" in lower:
                    status = "ready"
                elif not is_alive:
@@ -1928,7 +2165,14 @@ def setup_cookbook_routes() -> APIRouter:
                    status = "running"
            else:
                # Session is dead — check if it completed or crashed
-                status = "stopped"
+                if task_type == "download" and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or "")):
+                    status = "completed"
+                    if not progress_text:
+                        progress_text = "Download complete"
+                    if not full_snapshot:
+                        full_snapshot = "DOWNLOAD_OK"
+                else:
+                    status = "stopped"

            # Parse structured phase info — single source of truth for the UI
            phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {}
@@ -1938,6 +2182,8 @@ def setup_cookbook_routes() -> APIRouter:
            diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None
            if diagnosis and status in {"running", "unknown", "stopped"}:
                status = "error"
+            if download_zero_files:
+                diagnosis = {"message": "No matching files were downloaded. The model repo or filename/quant pattern may be wrong (for example a ':Q4_K_M' tag that does not exist in the repo). Check the repo and the include/quant pattern."}
            output_tail = "\n".join(full_snapshot.splitlines()[-12:]) if full_snapshot else ""

            results.append({