Generate macOS/Metal serve commands and surface the Metal GPU

cookbook_routes.py adds a macOS serve path: Ollama, a Metal-aware llama.cpp build that uses `sysctl hw.ncpu` instead of `nproc`, and a clear error if vLLM is attempted. The frontend defaults Metal serving to llama.cpp and offers llama.cpp/Ollama instead of vLLM/SGLang. The odysseus-cookbook CLI's `gpus` command reports the Metal GPU via sysctl/vm_stat.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-17 02:05:22 -04:00 · 2026-05-31 20:24:38 -05:00
parent 32ac81dbc6
commit 4ba01ce25d
4 changed files with 122 additions and 7 deletions
@@ -171,6 +171,13 @@ export function _isWindows(hostOrTask) {
  return _getPlatform(hostOrTask) === 'windows';
 }

+/** Check if the detected (local) hardware is Apple Silicon / Metal. Keys off the
+ *  hardware probe's backend rather than a platform string, since a local Mac
+ *  reports no platform but does report backend: "metal". */
+export function _isMetal() {
+  return ['metal', 'mps', 'apple'].includes(String(_hwfitCache?.system?.backend || '').toLowerCase());
+}
+
 /** Detect model-specific vLLM optimizations */
 function _detectModelOptimizations(modelName) {
  const n = (modelName || '').toLowerCase();
@@ -252,6 +259,13 @@ export function _detectBackend(model) {
    return { backend: 'llamacpp', label: 'llama.cpp' };
  }

+  // Apple Silicon (Metal) → llama.cpp (GGUF). vLLM/SGLang are CUDA/ROCm-only and
+  // don't run on macOS; AWQ/GPTQ/FP8 (vLLM-only) models are already filtered out
+  // of metal Cookbook results, so llama.cpp is always the right engine here.
+  if (['metal', 'mps', 'apple'].includes(sysBackend)) {
+    return { backend: 'llamacpp', label: 'llama.cpp' };
+  }
+
  // AWQ / GPTQ / FP8 → vLLM
  if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') {
    return { backend: 'vllm', label: 'vLLM' };
@@ -1761,6 +1775,7 @@ const shared = {
  _sshPrefix,
  _getPlatform,
  _isWindows,
+  _isMetal,
  _buildEnvPrefix,
  _buildServeCmd,
  _shellQuote,