diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index 9ba054b32..3acc7fdb0 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -866,17 +866,45 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."') runner_lines.append(' mkdir -p ~/bin') runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp') - # GPU build if CUDA is present; fall back to a plain (CPU) build. - runner_lines.append(' cd ~/llama.cpp && { cmake -B build -DGGML_CUDA=ON 2>/dev/null || cmake -B build; } \\') - runner_lines.append(' && cmake --build build -j"$(nproc)" --target llama-server \\') - runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') + # Build with the right accelerator: Metal on macOS (llama.cpp + # enables it automatically, no flag), CUDA on Linux when present, + # else a plain CPU build. nproc is Linux-only — fall back to + # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships + # a prebuilt llama-server and skips this whole source build.) 
+ runner_lines.append(' NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"') + runner_lines.append(' if [ "$(uname -s)" = "Darwin" ]; then') + runner_lines.append(' cd ~/llama.cpp && cmake -B build \\') + runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\') + runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') + runner_lines.append(' else') + runner_lines.append(' cd ~/llama.cpp && { cmake -B build -DGGML_CUDA=ON 2>/dev/null || cmake -B build; } \\') + runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\') + runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') + runner_lines.append(' fi') runner_lines.append(' # If the native build failed, fall back to the Python bindings.') runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then') runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."') runner_lines.append(' pip install --user --break-system-packages -q llama-cpp-python 2>/dev/null || pip install -q llama-cpp-python 2>/dev/null || true') runner_lines.append(' fi') runner_lines.append('fi') + elif "ollama" in req.cmd: + # Ollama manages its own model store and HTTP server. Just make + # sure the binary exists and the daemon is up before running the + # command (the natural serving engine on Apple Silicon / Metal). + runner_lines.append('if ! command -v ollama &>/dev/null; then') + runner_lines.append(' echo "ERROR: Ollama not found. Install it (macOS: brew install ollama, or https://ollama.com/download), then launch again."') + runner_lines.append(' exit 127') + runner_lines.append('fi') + runner_lines.append('if ! 
def _vm_stat_free_mb(vm_stat_output: str) -> int | None:
    """Return the reclaimable memory, in MiB, described by `vm_stat` output.

    Adds up the "Pages free", "Pages inactive" and "Pages speculative"
    counters and multiplies by the page size announced in the header line
    (4096 is assumed when no header is found). Returns None when those
    counters add up to zero so the caller can keep its own default.
    """
    page_size = 4096
    counters: dict[str, int] = {}
    for line in vm_stat_output.splitlines():
        if "page size of" in line:
            match = re.search(r"page size of (\d+)", line)
            if match:
                page_size = int(match.group(1))
            continue
        key, sep, value = line.partition(":")
        if not sep:
            continue
        # Counter values are printed with a trailing period ("12345.").
        value = value.strip().rstrip(".")
        if value.isdigit():
            counters[key.strip()] = int(value)

    free_pages = sum(
        counters.get(key, 0)
        for key in ("Pages free", "Pages inactive", "Pages speculative")
    )
    if not free_pages:
        return None
    return (free_pages * page_size) // (1024 * 1024)


def _macos_metal_gpu() -> list | None:
    """Describe the local Mac's Metal GPU in the same shape as an nvidia-smi row.

    Apple Silicon has no discrete VRAM, so total system memory (`hw.memsize`)
    is reported as the GPU budget; this lets the web UI's picker show the
    Mac's Metal GPU instead of 'no GPU'. `free_mb` is approximated from
    `vm_stat` (page-granular) and falls back to `total_mb` when `vm_stat`
    is unavailable or unparsable. macOS doesn't expose Metal utilization to
    the shell, so `util_pct` is always 0.

    Returns:
        A one-element list holding a dict with the keys index, name, free_mb,
        total_mb, used_mb, util_pct, uuid, unified_memory and busy; or None
        when not running on macOS or when `hw.memsize` cannot be read as a
        positive number.
    """
    if sys.platform != "darwin":
        return None

    # Only the failures subprocess.run can actually raise are treated as
    # "probe unavailable": a missing binary (OSError), a timeout
    # (SubprocessError) or undecodable output (ValueError).
    probe_errors = (OSError, subprocess.SubprocessError, ValueError)

    def _sysctl(key: str) -> str | None:
        try:
            r = subprocess.run(["sysctl", "-n", key], capture_output=True, text=True, timeout=5)
        except probe_errors:
            return None
        return r.stdout.strip() if r.returncode == 0 else None

    memsize = _sysctl("hw.memsize")
    if not memsize or not memsize.isdigit():
        return None
    total_mb = int(memsize) // (1024 * 1024)
    if total_mb <= 0:
        # A zero-sized budget is not a usable GPU entry.
        return None
    name = _sysctl("machdep.cpu.brand_string") or "Apple Silicon"

    free_mb = total_mb
    try:
        vm = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
    except probe_errors:
        vm = None
    if vm is not None and vm.returncode == 0:
        parsed_free_mb = _vm_stat_free_mb(vm.stdout)
        if parsed_free_mb is not None:
            # Never report more free memory than the machine has.
            free_mb = min(parsed_free_mb, total_mb)

    return [{
        "index": 0,
        "name": name,
        "free_mb": free_mb,
        "total_mb": total_mb,
        "used_mb": total_mb - free_mb,
        "util_pct": 0,
        "uuid": "apple-metal-0",
        "unified_memory": True,
        # Flagged busy once less than half of the memory is reclaimable.
        "busy": (free_mb / total_mb) < 0.5,
    }]
/**
 * Report whether the locally probed hardware uses an Apple Metal backend.
 *
 * Reads `backend` from the cached hardware probe (`_hwfitCache.system`)
 * instead of a platform string: a local Mac reports no platform but does
 * report backend: "metal". A missing cache or backend counts as "not Metal".
 */
export function _isMetal() {
  const probedBackend = _hwfitCache?.system?.backend;
  const normalized = String(probedBackend || '').toLowerCase();
  return normalized === 'metal' || normalized === 'mps' || normalized === 'apple';
}
vLLM/SGLang are CUDA/ROCm-only and + // don't run on macOS; AWQ/GPTQ/FP8 (vLLM-only) models are already filtered out + // of metal Cookbook results, so llama.cpp is always the right engine here. + if (['metal', 'mps', 'apple'].includes(sysBackend)) { + return { backend: 'llamacpp', label: 'llama.cpp' }; + } + // AWQ / GPTQ / FP8 → vLLM if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') { return { backend: 'vllm', label: 'vLLM' }; @@ -1761,6 +1775,7 @@ const shared = { _sshPrefix, _getPlatform, _isWindows, + _isMetal, _buildEnvPrefix, _buildServeCmd, _shellQuote, diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index e343fe6ca..eea2631f4 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -16,6 +16,7 @@ let _getPort; let _sshPrefix; let _getPlatform; let _isWindows; +let _isMetal; let _buildEnvPrefix; let _buildServeCmd; let _shellQuote; @@ -382,6 +383,8 @@ function _rerenderCachedModels() { panelHtml += `
`; const _backendChoices = _isWindows() ? [['llamacpp','llama.cpp']] + : _isMetal() + ? [['llamacpp','llama.cpp'],['ollama','Ollama'],['diffusers','Diffusers']] : [['vllm','vLLM'],['sglang','SGLang'],['llamacpp','llama.cpp'],['diffusers','Diffusers']]; const backendOpts = _backendChoices.map(([v,l]) => ``).join(''); panelHtml += ``; @@ -1592,6 +1595,7 @@ export function initServe(shared) { _sshPrefix = shared._sshPrefix; _getPlatform = shared._getPlatform; _isWindows = shared._isWindows; + _isMetal = shared._isMetal; _buildEnvPrefix = shared._buildEnvPrefix; _buildServeCmd = shared._buildServeCmd; _shellQuote = shared._shellQuote;