mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
Generate macOS/Metal serve commands and surface the Metal GPU
cookbook_routes.py adds a macOS serve path (Ollama, Metal-aware llama.cpp build using `sysctl hw.ncpu` instead of `nproc`, and a clear error if vLLM is attempted). The frontend defaults Metal serving to llama.cpp and offers llama.cpp/Ollama instead of vLLM/SGLang. The odysseus-cookbook CLI's `gpus` command reports the Metal GPU via sysctl/vm_stat. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -866,17 +866,45 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
|
||||
runner_lines.append(' mkdir -p ~/bin')
|
||||
runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
|
||||
# GPU build if CUDA is present; fall back to a plain (CPU) build.
|
||||
runner_lines.append(' cd ~/llama.cpp && { cmake -B build -DGGML_CUDA=ON 2>/dev/null || cmake -B build; } \\')
|
||||
runner_lines.append(' && cmake --build build -j"$(nproc)" --target llama-server \\')
|
||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||
# Build with the right accelerator: Metal on macOS (llama.cpp
|
||||
# enables it automatically, no flag), CUDA on Linux when present,
|
||||
# else a plain CPU build. nproc is Linux-only — fall back to
|
||||
# `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships
|
||||
# a prebuilt llama-server and skips this whole source build.)
|
||||
runner_lines.append(' NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"')
|
||||
runner_lines.append(' if [ "$(uname -s)" = "Darwin" ]; then')
|
||||
runner_lines.append(' cd ~/llama.cpp && cmake -B build \\')
|
||||
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||
runner_lines.append(' else')
|
||||
runner_lines.append(' cd ~/llama.cpp && { cmake -B build -DGGML_CUDA=ON 2>/dev/null || cmake -B build; } \\')
|
||||
runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
|
||||
runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
|
||||
runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
|
||||
runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
|
||||
runner_lines.append(' pip install --user --break-system-packages -q llama-cpp-python 2>/dev/null || pip install -q llama-cpp-python 2>/dev/null || true')
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append('fi')
|
||||
elif "ollama" in req.cmd:
|
||||
# Ollama manages its own model store and HTTP server. Just make
|
||||
# sure the binary exists and the daemon is up before running the
|
||||
# command (the natural serving engine on Apple Silicon / Metal).
|
||||
runner_lines.append('if ! command -v ollama &>/dev/null; then')
|
||||
runner_lines.append(' echo "ERROR: Ollama not found. Install it (macOS: brew install ollama, or https://ollama.com/download), then launch again."')
|
||||
runner_lines.append(' exit 127')
|
||||
runner_lines.append('fi')
|
||||
runner_lines.append('if ! curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then')
|
||||
runner_lines.append(' echo "Starting ollama server..."; (ollama serve >/dev/null 2>&1 &)')
|
||||
runner_lines.append(' for _ in 1 2 3 4 5 6 7 8 9 10; do curl -sf http://localhost:11434/api/tags >/dev/null 2>&1 && break; sleep 1; done')
|
||||
runner_lines.append('fi')
|
||||
elif "vllm serve" in req.cmd:
|
||||
# vLLM is CUDA/ROCm-only and does not run on macOS at all.
|
||||
runner_lines.append('if [ "$(uname -s)" = "Darwin" ]; then')
|
||||
runner_lines.append(' echo "ERROR: vLLM does not run on macOS. Use Ollama or llama.cpp (Metal) instead."')
|
||||
runner_lines.append(' exit 1')
|
||||
runner_lines.append('fi')
|
||||
# Put ~/.local/bin on PATH first — without a venv, vllm installs
|
||||
# there via --user and the non-login serve shell otherwise can't
|
||||
# find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
|
||||
|
||||
@@ -95,21 +95,89 @@ def cmd_list(args) -> None:
|
||||
|
||||
# ─── gpus ────────────────────────────────────────────────────────────
|
||||
|
||||
def _macos_metal_gpu() -> list | None:
|
||||
"""Apple Silicon has no discrete VRAM — report total unified memory as the
|
||||
GPU budget so the web UI's picker shows the Mac's Metal GPU instead of
|
||||
'no GPU'. `free` is approximated from vm_stat (page-granular); macOS doesn't
|
||||
expose Metal utilization to the shell, so util is 0. Returns None off macOS."""
|
||||
if sys.platform != "darwin":
|
||||
return None
|
||||
|
||||
def _sysctl(key: str) -> str | None:
|
||||
try:
|
||||
r = subprocess.run(["sysctl", "-n", key], capture_output=True, text=True, timeout=5)
|
||||
return r.stdout.strip() if r.returncode == 0 else None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
memsize = _sysctl("hw.memsize")
|
||||
if not memsize or not memsize.isdigit():
|
||||
return None
|
||||
total_mb = int(memsize) // (1024 * 1024)
|
||||
name = _sysctl("machdep.cpu.brand_string") or "Apple Silicon"
|
||||
|
||||
free_mb = total_mb
|
||||
try:
|
||||
vm = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
|
||||
if vm.returncode == 0:
|
||||
page_size, pages = 4096, {}
|
||||
for line in vm.stdout.splitlines():
|
||||
if "page size of" in line:
|
||||
m = re.search(r"page size of (\d+)", line)
|
||||
if m:
|
||||
page_size = int(m.group(1))
|
||||
elif ":" in line:
|
||||
k, v = line.split(":", 1)
|
||||
v = v.strip().rstrip(".")
|
||||
if v.isdigit():
|
||||
pages[k.strip()] = int(v)
|
||||
free_pages = (pages.get("Pages free", 0) + pages.get("Pages inactive", 0)
|
||||
+ pages.get("Pages speculative", 0))
|
||||
if free_pages:
|
||||
free_mb = (free_pages * page_size) // (1024 * 1024)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return [{
|
||||
"index": 0,
|
||||
"name": name,
|
||||
"free_mb": free_mb,
|
||||
"total_mb": total_mb,
|
||||
"used_mb": max(0, total_mb - free_mb),
|
||||
"util_pct": 0,
|
||||
"uuid": "apple-metal-0",
|
||||
"unified_memory": True,
|
||||
"busy": (free_mb / total_mb) < 0.5 if total_mb else False,
|
||||
}]
|
||||
|
||||
|
||||
def cmd_gpus(args) -> None:
|
||||
"""Same shape the web UI gets — index/name/free_mb/total_mb/used_mb/
|
||||
util_pct/uuid. Returns `[]` with an `error` field if nvidia-smi is
|
||||
missing (laptop / CPU-only box). Pass `--host user@box` to run over
|
||||
SSH against a remote machine."""
|
||||
util_pct/uuid. On Apple Silicon (no nvidia-smi) reports the Metal GPU's
|
||||
unified memory instead. Returns `[]` with an `error` field only on a
|
||||
CPU-only non-Mac box. Pass `--host user@box` to run over SSH."""
|
||||
query = "nvidia-smi --query-gpu=index,name,memory.free,memory.total,memory.used,utilization.gpu,uuid --format=csv,noheader,nounits"
|
||||
prefix = _ssh_prefix(args.host, args.ssh_port)
|
||||
cmd = prefix + (query.split() if not prefix else [query])
|
||||
try:
|
||||
out = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
|
||||
except FileNotFoundError:
|
||||
# No nvidia-smi locally → try the Metal fallback before giving up.
|
||||
if not prefix:
|
||||
mac = _macos_metal_gpu()
|
||||
if mac is not None:
|
||||
emit({"ok": True, "gpus": mac, "backend": "metal"}, args)
|
||||
return
|
||||
msg = "ssh not found" if prefix else "nvidia-smi not found"
|
||||
emit({"ok": False, "error": msg, "gpus": []}, args)
|
||||
return
|
||||
if out.returncode != 0:
|
||||
# nvidia-smi present but errored (or no NVIDIA GPU) — fall back to Metal.
|
||||
if not prefix:
|
||||
mac = _macos_metal_gpu()
|
||||
if mac is not None:
|
||||
emit({"ok": True, "gpus": mac, "backend": "metal"}, args)
|
||||
return
|
||||
emit({"ok": False, "error": out.stderr.strip()[:200], "gpus": []}, args)
|
||||
return
|
||||
gpus = []
|
||||
|
||||
@@ -171,6 +171,13 @@ export function _isWindows(hostOrTask) {
|
||||
return _getPlatform(hostOrTask) === 'windows';
|
||||
}
|
||||
|
||||
/**
 * Report whether the detected (local) hardware is Apple Silicon / Metal.
 *
 * Keys off the hardware probe's backend rather than a platform string, since a
 * local Mac reports no platform but does report backend: "metal".
 *
 * @returns {boolean} true when the cached probe's `system.backend` is
 *   'metal', 'mps' or 'apple' (compared case-insensitively).
 */
export function _isMetal() {
  const metalBackends = ['metal', 'mps', 'apple'];
  // A missing cache or backend collapses to '' and therefore to false.
  const backend = String(_hwfitCache?.system?.backend || '').toLowerCase();
  return metalBackends.includes(backend);
}
|
||||
|
||||
/** Detect model-specific vLLM optimizations */
|
||||
function _detectModelOptimizations(modelName) {
|
||||
const n = (modelName || '').toLowerCase();
|
||||
@@ -252,6 +259,13 @@ export function _detectBackend(model) {
|
||||
return { backend: 'llamacpp', label: 'llama.cpp' };
|
||||
}
|
||||
|
||||
// Apple Silicon (Metal) → llama.cpp (GGUF). vLLM/SGLang are CUDA/ROCm-only and
|
||||
// don't run on macOS; AWQ/GPTQ/FP8 (vLLM-only) models are already filtered out
|
||||
// of metal Cookbook results, so llama.cpp is always the right engine here.
|
||||
if (['metal', 'mps', 'apple'].includes(sysBackend)) {
|
||||
return { backend: 'llamacpp', label: 'llama.cpp' };
|
||||
}
|
||||
|
||||
// AWQ / GPTQ / FP8 → vLLM
|
||||
if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') {
|
||||
return { backend: 'vllm', label: 'vLLM' };
|
||||
@@ -1761,6 +1775,7 @@ const shared = {
|
||||
_sshPrefix,
|
||||
_getPlatform,
|
||||
_isWindows,
|
||||
_isMetal,
|
||||
_buildEnvPrefix,
|
||||
_buildServeCmd,
|
||||
_shellQuote,
|
||||
|
||||
@@ -16,6 +16,7 @@ let _getPort;
|
||||
let _sshPrefix;
|
||||
let _getPlatform;
|
||||
let _isWindows;
|
||||
let _isMetal;
|
||||
let _buildEnvPrefix;
|
||||
let _buildServeCmd;
|
||||
let _shellQuote;
|
||||
@@ -382,6 +383,8 @@ function _rerenderCachedModels() {
|
||||
panelHtml += `<div class="hwfit-serve-row">`;
|
||||
const _backendChoices = _isWindows()
|
||||
? [['llamacpp','llama.cpp']]
|
||||
: _isMetal()
|
||||
? [['llamacpp','llama.cpp'],['ollama','Ollama'],['diffusers','Diffusers']]
|
||||
: [['vllm','vLLM'],['sglang','SGLang'],['llamacpp','llama.cpp'],['diffusers','Diffusers']];
|
||||
const backendOpts = _backendChoices.map(([v,l]) => `<option value="${v}"${defaultBackend===v?' selected':''}>${l}</option>`).join('');
|
||||
panelHtml += `<label>${_l('Backend','Inference engine: vLLM, SGLang, llama.cpp, or Diffusers')}<select class="hwfit-sf" data-field="backend">${backendOpts}</select></label>`;
|
||||
@@ -1592,6 +1595,7 @@ export function initServe(shared) {
|
||||
_sshPrefix = shared._sshPrefix;
|
||||
_getPlatform = shared._getPlatform;
|
||||
_isWindows = shared._isWindows;
|
||||
_isMetal = shared._isMetal;
|
||||
_buildEnvPrefix = shared._buildEnvPrefix;
|
||||
_buildServeCmd = shared._buildServeCmd;
|
||||
_shellQuote = shared._shellQuote;
|
||||
|
||||
Reference in New Issue
Block a user