Improve Cookbook serve diagnostics and recommendations

2026-06-17 02:05:22 -04:00 · 2026-06-02 12:15:41 +09:00
parent bdc99d746a
commit 966b53df77
14 changed files with 1113 additions and 191 deletions
@@ -962,13 +962,23 @@ def setup_cookbook_routes() -> APIRouter:
                # failed CUDA attempt) doesn't cause the next configure to reuse
                # stale settings and silently produce a CPU-only binary.
                runner_lines.append('    cd ~/llama.cpp && rm -rf build')
                runner_lines.append('    _ody_has_cuda_runtime=0')
                runner_lines.append('    if command -v nvcc &>/dev/null; then')
                runner_lines.append('      for _cudalib in "${CUDA_HOME:-}/lib64"/libcudart.so* "${CUDA_HOME:-}/lib"/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib*/libcudart.so*; do')
                runner_lines.append('        [ -e "$_cudalib" ] && _ody_has_cuda_runtime=1 && break')
                runner_lines.append('      done')
                runner_lines.append('    fi')
                runner_lines.append('    if command -v nvcc &>/dev/null && [ "$_ody_has_cuda_runtime" = "1" ]; then')
                runner_lines.append('      echo "[odysseus] CUDA nvcc found — building llama-server with CUDA (GPU) support..."')
                runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \\')
                runner_lines.append('        && cmake --build build -j"$NPROC" --target llama-server \\')
                runner_lines.append('        && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
                runner_lines.append('    else')
-                runner_lines.append('      echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
+                runner_lines.append('      if command -v nvcc &>/dev/null; then')
                runner_lines.append('        echo "[odysseus] WARNING: nvcc found but CUDA runtime library was not found — building llama-server for CPU only."')
                runner_lines.append('      else')
                runner_lines.append('        echo "[odysseus] WARNING: nvcc not found — building llama-server for CPU only."')
                runner_lines.append('      fi')
                runner_lines.append('      echo "[odysseus]   GPU inference will not be available for this llama.cpp build."')
                runner_lines.append('      echo "[odysseus]   To get a GPU build, first install vLLM via Cookbook -> Dependencies"')
                runner_lines.append('      echo "[odysseus]   (its CUDA wheels include nvcc), then re-launch this serve task."')
@@ -982,6 +992,10 @@ def setup_cookbook_routes() -> APIRouter:
                runner_lines.append('    echo "llama-server build failed — installing Python bindings as fallback..."')
                runner_lines.append(f"    {_pip_install_fallback_chain('llama-cpp-python', python_cmd='pip')} || true")
                runner_lines.append('  fi')
                runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    echo "ERROR: llama.cpp serving is not available after install/build attempts."')
                runner_lines.append('    ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('  fi')
                runner_lines.append('fi')
            elif "ollama" in req.cmd:
                handled_ollama_serve = True
@@ -1037,19 +1051,24 @@ def setup_cookbook_routes() -> APIRouter:
                # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
                runner_lines.append('if ! command -v vllm &>/dev/null; then')
-                runner_lines.append('  echo "ERROR: vLLM is not installed. Open Cookbook -> Dependencies and install vllm on this server, then launch again."')
+                runner_lines.append('  echo "ERROR: vLLM is not installed."')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('fi')
            elif "sglang.launch_server" in req.cmd:
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
-                runner_lines.append('if ! python3 -c "import sglang" 2>/dev/null; then')
+                runner_lines.append('if ! command -v sglang &>/dev/null; then')
-                runner_lines.append('  echo "ERROR: SGLang is not installed. Open Cookbook -> Dependencies and install sglang on this server, then launch again."')
+                runner_lines.append('  echo "ERROR: SGLang is not installed."')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('elif ! ODYSSEUS_SGLANG_IMPORT_ERROR="$(python3 -c "import sglang" 2>&1)"; then')
                runner_lines.append('  echo "ERROR: SGLang is installed but failed to import."')
                runner_lines.append('  printf "%s\\n" "$ODYSSEUS_SGLANG_IMPORT_ERROR"')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('fi')
            elif "scripts/diffusion_server.py" in req.cmd or ".diffusion_server.py" in req.cmd:
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
-                runner_lines.append('if ! python3 -c "import torch, diffusers" 2>/dev/null; then')
+                runner_lines.append('if ! ODYSSEUS_DIFFUSION_IMPORT_ERROR="$(python3 -c "import torch, diffusers" 2>&1)"; then')
-                runner_lines.append('  echo "ERROR: Diffusion serving requires PyTorch + diffusers. Open Cookbook -> Dependencies and install diffusers on this server, then launch again."')
+                runner_lines.append('  echo "ERROR: Diffusion serving requires PyTorch + diffusers."')
                runner_lines.append('  printf "%s\\n" "$ODYSSEUS_DIFFUSION_IMPORT_ERROR"')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('fi')
@@ -88,6 +88,8 @@ def _quant_from_name(name):
        if "6bit" in n:
            return "mlx-6bit"
        return "mlx-8bit" if is8 else "mlx-4bit"
    if "nvfp4" in n:
        return "NVFP4"
    if "fp8" in n:
        return "FP8"
    if "int4" in n or "4bit" in n or "4-bit" in n:
@@ -136,7 +138,7 @@ def _entry_from_modelinfo(mi, overrides):
                params_by_dtype = getattr(st, "parameters", None) or {}
                if quant.endswith("4bit") or quant.endswith("Int4"):
                    pack_factor = 8
-                elif quant.endswith("8bit") or quant.endswith("Int8") or quant == "FP8":
+                elif quant.endswith("8bit") or quant.endswith("Int8") or quant in ("FP8", "NVFP4"):
                    pack_factor = 4
                else:
                    pack_factor = 1
@@ -158,7 +160,7 @@ def _entry_from_modelinfo(mi, overrides):
    rel = created.strftime("%Y-%m-%d") if created else datetime.utcnow().strftime("%Y-%m-%d")
    # Rough RAM/VRAM hints (fit.py recomputes the real requirement from params+quant).
    _BPP = {"AWQ-4bit": 0.58, "GPTQ-Int4": 0.58, "mlx-4bit": 0.55, "mlx-6bit": 0.85,
-            "AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1, "Q4_K_M": 0.6}
+            "AWQ-8bit": 1.1, "GPTQ-Int8": 1.1, "mlx-8bit": 1.1, "FP8": 1.1, "NVFP4": 0.6, "Q4_K_M": 0.6}
    bpp = _BPP.get(quant, 0.6)
    vram = round(pb * bpp + 0.5, 1)
    entry = {
@@ -13919,7 +13919,12 @@
  "architecture": "gemma4",
  "pipeline_tag": "image-text-to-text",
  "release_date": "2026-04-01",
-  "gguf_sources": [],
+  "gguf_sources": [
   {
    "repo": "unsloth/gemma-4-E2B-it-GGUF",
    "provider": "unsloth"
   }
  ],
  "capabilities": [
   "vision"
  ]
@@ -13942,7 +13947,12 @@
  "architecture": "gemma4",
  "pipeline_tag": "image-text-to-text",
  "release_date": "2026-04-01",
-  "gguf_sources": [],
+  "gguf_sources": [
   {
    "repo": "unsloth/gemma-4-E4B-it-GGUF",
    "provider": "unsloth"
   }
  ],
  "capabilities": [
   "vision"
  ]
@@ -13965,7 +13975,12 @@
  "architecture": "gemma4",
  "pipeline_tag": "image-text-to-text",
  "release_date": "2026-04-01",
-  "gguf_sources": [],
+  "gguf_sources": [
   {
    "repo": "unsloth/gemma-4-31B-it-GGUF",
    "provider": "unsloth"
   }
  ],
  "capabilities": [
   "vision"
  ]
@@ -13988,7 +14003,12 @@
  "architecture": "gemma4",
  "pipeline_tag": "image-text-to-text",
  "release_date": "2026-04-01",
-  "gguf_sources": [],
+  "gguf_sources": [
   {
    "repo": "unsloth/gemma-4-26B-A4B-it-GGUF",
    "provider": "unsloth"
   }
  ],
  "capabilities": [
   "vision"
  ]
@@ -18719,5 +18739,307 @@
  "hf_likes": 0,
  "release_date": "2026-04-19",
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.6-27B-MTP",
  "provider": "Qwen",
  "parameter_count": "27.8B",
  "parameters_raw": 27781427952,
  "min_ram_gb": 16.6,
  "recommended_ram_gb": 21.6,
  "min_vram_gb": 16.6,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, coding, MTP",
  "is_moe": false,
  "num_experts": null,
  "active_experts": null,
  "active_parameters": null,
  "architecture": "qwen3",
  "pipeline_tag": "text-generation",
  "release_date": "2026-04-01",
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.6-27B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "capabilities": [
   "mtp"
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.6-35B-A3B-MTP",
  "provider": "Qwen",
  "parameter_count": "36.0B",
  "parameters_raw": 35951822704,
  "min_ram_gb": 21.4,
  "recommended_ram_gb": 27.8,
  "min_vram_gb": 21.4,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose (MoE), MTP",
  "is_moe": true,
  "num_experts": null,
  "active_experts": null,
  "active_parameters": 3000000000,
  "architecture": "qwen3_moe",
  "pipeline_tag": "text-generation",
  "release_date": "2026-04-01",
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.6-35B-A3B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "capabilities": [
   "mtp"
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.5-0.8B-MTP",
  "provider": "Qwen",
  "parameter_count": "873M",
  "parameters_raw": 873438784,
  "min_ram_gb": 1.0,
  "recommended_ram_gb": 2.0,
  "min_vram_gb": 0.5,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, MTP",
  "capabilities": [
   "mtp",
   "tool_use",
   "vision"
  ],
  "pipeline_tag": "image-text-to-text",
  "architecture": "qwen3_5",
  "hf_downloads": 93448,
  "hf_likes": 208,
  "release_date": "2026-02-28",
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.5-0.8B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.5-2B-MTP",
  "provider": "Qwen",
  "parameter_count": "2.3B",
  "parameters_raw": 2274069824,
  "min_ram_gb": 1.3,
  "recommended_ram_gb": 2.1,
  "min_vram_gb": 1.2,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, MTP",
  "capabilities": [
   "mtp",
   "tool_use",
   "vision"
  ],
  "pipeline_tag": "image-text-to-text",
  "architecture": "qwen3_5",
  "hf_downloads": 46974,
  "hf_likes": 115,
  "release_date": "2026-02-28",
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.5-2B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.5-4B-MTP",
  "provider": "Qwen",
  "parameter_count": "4.7B",
  "parameters_raw": 4659865088,
  "min_ram_gb": 2.6,
  "recommended_ram_gb": 4.3,
  "min_vram_gb": 2.4,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, MTP",
  "capabilities": [
   "mtp",
   "tool_use",
   "vision"
  ],
  "pipeline_tag": "image-text-to-text",
  "architecture": "qwen3_5",
  "hf_downloads": 99087,
  "hf_likes": 202,
  "release_date": "2026-02-27",
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.5-4B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.5-9B-MTP",
  "provider": "Qwen",
  "parameter_count": "9.7B",
  "parameters_raw": 9653104368,
  "min_ram_gb": 5.4,
  "recommended_ram_gb": 9.0,
  "min_vram_gb": 4.9,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, MTP",
  "capabilities": [
   "mtp",
   "tool_use",
   "vision"
  ],
  "pipeline_tag": "image-text-to-text",
  "architecture": "qwen3_5",
  "hf_downloads": 172298,
  "hf_likes": 345,
  "release_date": "2026-02-27",
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.5-9B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.5-27B-MTP",
  "provider": "Qwen",
  "parameter_count": "27.8B",
  "parameters_raw": 27781427952,
  "min_ram_gb": 15.5,
  "recommended_ram_gb": 25.9,
  "min_vram_gb": 14.2,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, MTP",
  "capabilities": [
   "mtp",
   "tool_use",
   "vision"
  ],
  "pipeline_tag": "image-text-to-text",
  "architecture": "qwen3_5",
  "hf_downloads": 406808,
  "hf_likes": 565,
  "release_date": "2026-02-24",
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.5-27B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.5-35B-A3B-MTP",
  "provider": "Qwen",
  "parameter_count": "36.0B",
  "parameters_raw": 35951822704,
  "min_ram_gb": 20.1,
  "recommended_ram_gb": 33.5,
  "min_vram_gb": 18.4,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, MTP",
  "capabilities": [
   "mtp",
   "tool_use",
   "vision"
  ],
  "pipeline_tag": "image-text-to-text",
  "architecture": "qwen3_5_moe",
  "hf_downloads": 769032,
  "hf_likes": 905,
  "release_date": "2026-02-24",
  "is_moe": true,
  "num_experts": 256,
  "active_experts": 8,
  "active_parameters": 3000000000,
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.5-35B-A3B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.5-122B-A10B-MTP",
  "provider": "Qwen",
  "parameter_count": "125.1B",
  "parameters_raw": 125086497008,
  "min_ram_gb": 69.9,
  "recommended_ram_gb": 116.5,
  "min_vram_gb": 64.1,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, MTP",
  "capabilities": [
   "mtp",
   "tool_use",
   "vision"
  ],
  "pipeline_tag": "image-text-to-text",
  "architecture": "qwen3_5_moe",
  "hf_downloads": 171055,
  "hf_likes": 389,
  "release_date": "2026-02-24",
  "is_moe": true,
  "num_experts": 256,
  "active_experts": 8,
  "active_parameters": 10000000000,
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.5-122B-A10B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "_discovered": true
 },
 {
  "name": "Qwen/Qwen3.5-397B-A17B-MTP",
  "provider": "Qwen",
  "parameter_count": "403.4B",
  "parameters_raw": 403397928944,
  "min_ram_gb": 225.4,
  "recommended_ram_gb": 375.7,
  "min_vram_gb": 206.6,
  "quantization": "Q4_K_M",
  "context_length": 262144,
  "use_case": "General purpose, MTP",
  "capabilities": [
   "mtp",
   "tool_use",
   "vision"
  ],
  "pipeline_tag": "image-text-to-text",
  "architecture": "qwen3_5_moe",
  "hf_downloads": 1291825,
  "hf_likes": 1214,
  "release_date": "2026-02-16",
  "is_moe": true,
  "num_experts": 256,
  "active_experts": 8,
  "active_parameters": 17000000000,
  "gguf_sources": [
   {
    "repo": "unsloth/Qwen3.5-397B-A17B-MTP-GGUF",
    "provider": "unsloth"
   }
  ],
  "_discovered": true
 }
 ]
@@ -99,6 +99,27 @@ def _estimate_speed(model, quant, run_mode, system):
    return k / pb * sm
 def _architecture_bonus(model):
    name = (model.get("name") or "").lower()
    arch = (model.get("architecture") or "").lower()
    text = f"{name} {arch}"
    # Keep this intentionally small: hardware fit and speed still matter, but
    # current model families should not be scored the same as older Qwen2/LLama
    # era entries just because the parameter count is similar.
    if "qwen3.6" in text or "qwen3_6" in text:
        return 9
    if "qwen3.5" in text or "qwen3_5" in text:
        return 8
    if "qwen3-next" in text or "qwen3_next" in text:
        return 6
    if "qwen3" in text or arch.startswith("qwen3"):
        return 4
    if "qwen2.5" in text or "qwen2_5" in text:
        return 2
    return 0
 def _quality_score(model, quant, use_case):
    pb = params_b(model)
    if pb < 1:
@@ -128,6 +149,7 @@ def _quality_score(model, quant, use_case):
    if "gemma" in name_lower:
        base += 1
    base += _architecture_bonus(model)
    base += QUANT_QUALITY_PENALTY.get(quant, 0)
    model_uc = infer_use_case(model)
@@ -220,12 +242,13 @@ def _quant_bits(q):
    return 0
-def analyze_model(model, system, target_quant=None):
+def analyze_model(model, system, target_quant=None, scoring_use_case=None):
    pb = params_b(model)
    if pb <= 0:
        return None
-    use_case = infer_use_case(model)
+    model_use_case = infer_use_case(model)
    score_use_case = scoring_use_case or "general"
    has_gpu = system.get("has_gpu", False)
    gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
    gpu_count = system.get("gpu_count", 1) or 1
@@ -242,6 +265,8 @@ def analyze_model(model, system, target_quant=None):
    ctx = model.get("context_length", 4096) or 4096
    native_quant = model.get("quantization", "Q4_K_M")
    if "nvfp4" in (model.get("name") or "").lower():
        native_quant = "NVFP4"
    preq = is_prequantized(model)
    # GGUF models can't be sharded across GPUs — use single GPU VRAM
@@ -260,10 +285,13 @@ def analyze_model(model, system, target_quant=None):
    # Determine which quant to evaluate at
    if preq:
        # AWQ/GPTQ/FP8/MLX come at a fixed bit-width. If the user picked a
-        # specific quant tier (e.g. Q8 → 8-bit), only keep prequant models whose
+        # GGUF quant tier (Q4/Q8/etc.), do not treat a same-bit AWQ/GPTQ build
-        # native bit-width matches — otherwise selecting Q8 would still surface
+        # as equivalent. "Q4" means llama.cpp/Ollama-style GGUF in this UI;
-        # AWQ-4bit models, mixing 4- and 8-bit in one view.
+        # AWQ/GPTQ/FP8 are separate GPU-serving formats and must only appear
        # when explicitly selected or when no quant filter is applied.
        if target_quant:
            if not any(target_quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4")):
                return None
            _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
            if _tb and _nb and _tb != _nb:
                return None
@@ -300,7 +328,7 @@ def analyze_model(model, system, target_quant=None):
            "parameter_count": model.get("parameter_count"),
            "params_b": round(pb, 1),
            "is_moe": is_moe,
-            "use_case": use_case,
+            "use_case": model_use_case,
            "fit_level": "too_tight",
            "run_mode": "no_fit",
            "quant": quant_to_try,
@@ -334,12 +362,12 @@ def analyze_model(model, system, target_quant=None):
    tps = _estimate_speed(model, quant, run_mode, system)
-    q_score = _quality_score(model, quant, use_case)
+    q_score = _quality_score(model, quant, score_use_case)
-    s_score = _speed_score(tps, use_case)
+    s_score = _speed_score(tps, score_use_case)
    f_score = _fit_score(required_gb, budget)
-    c_score = _context_score(fit_ctx, use_case)
+    c_score = _context_score(fit_ctx, score_use_case)
-    wq, ws, wf, wc = USE_CASE_WEIGHTS.get(use_case, (0.45, 0.30, 0.15, 0.10))
+    wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10))
    composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc
    return {
@@ -348,7 +376,7 @@ def analyze_model(model, system, target_quant=None):
        "parameter_count": model.get("parameter_count"),
        "params_b": round(pb, 1),
        "is_moe": is_moe,
-        "use_case": use_case,
+        "use_case": model_use_case,
        "fit_level": fit_level,
        "run_mode": run_mode,
        "quant": quant,
@@ -419,21 +447,29 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
            results.sort(key=sort_fn, reverse=(sort != "vram"))
            return results[:limit]
-    # If user picked a prequantized format (AWQ/FP8/GPTQ), filter to only those models
+    # If user picked a prequantized format (AWQ/FP8/GPTQ/NVFP4), filter to only those models
-    filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8"))
+    filter_native = quant and any(quant.startswith(p) for p in ("AWQ-", "GPTQ-", "FP8", "NVFP4"))
    system_backend = (system.get("backend") or "").lower()
    apple_silicon = system_backend in ("mps", "metal", "apple")
    rocm = system_backend == "rocm"
    for m in models:
        native_q = m.get("quantization", "")
        if "nvfp4" in (m.get("name") or "").lower():
            native_q = "NVFP4"
-        # MLX-quantized models need the MLX runtime (mlx_lm), which Odysseus
+        # MLX is Apple Silicon only. Hide MLX rows on non-Mac hardware scans,
-        # doesn't generate serve commands for — only llama.cpp/Ollama (Metal)
+        # but leave them visible on Metal/MPS so Mac support is not broken.
-        # and vLLM/SGLang (CUDA). MLX repos ship no GGUF alternative, so they're
+        if not apple_silicon and (native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()):
-        # unrunnable on every backend we support. Always drop them, on Apple
+            continue
-        # Silicon too, so the Cookbook never recommends a model it can't serve.
+
-        if native_q.startswith("mlx-"):
+        # ROCm support for vLLM/SGLang quantized safetensors is too brittle to
        # recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable
        # only when the user explicitly picks that format from the quant filter;
        # otherwise prefer GGUF/Q* entries that Odysseus can route through
        # llama.cpp/Ollama without pretending "fits VRAM" means "servable".
        if rocm and is_prequantized(m) and not filter_native:
            continue
        # On Apple Silicon the only serving engines are llama.cpp and Ollama,
@@ -443,7 +479,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
        # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
        # this the Cookbook recommends models the Mac can't run; on CUDA these
        # stay visible because vLLM serves safetensors directly.
-        if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources")):
+        is_mlx = native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower()
        if apple_silicon and not (m.get("is_gguf") or m.get("gguf_sources") or is_mlx):
            continue
        # Format filter: AWQ tab → only AWQ models, FP8 tab → only FP8 models
@@ -454,6 +491,8 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
                continue
            if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
                continue
            if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"):
                continue
        if search:
            name = m.get("name", "").lower()
@@ -461,7 +500,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
            if search.lower() not in name and search.lower() not in provider:
                continue
-        result = analyze_model(m, system, target_quant=quant)
+        result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"))
        if result is None:
            continue
@@ -5,7 +5,7 @@ import re
 QUANT_HIERARCHY = ["Q8_0", "Q6_K", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"]
 QUANT_BPP = {
-    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "F32": 4.0, "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
    "Q8_0": 1.05, "Q6_K": 0.80, "Q5_K_M": 0.68,
    "Q4_K_M": 0.58, "Q4_0": 0.58, "Q3_K_M": 0.48, "Q2_K": 0.37,
    "AWQ-4bit": 0.50, "AWQ-8bit": 1.0,
@@ -14,7 +14,7 @@ QUANT_BPP = {
 }
 QUANT_SPEED_MULT = {
-    "F16": 0.6, "BF16": 0.6, "FP8": 0.85,
+    "F16": 0.6, "BF16": 0.6, "FP8": 0.85, "NVFP4": 1.1,
    "Q8_0": 0.8, "Q6_K": 0.95, "Q5_K_M": 1.0,
    "Q4_K_M": 1.15, "Q4_0": 1.15, "Q3_K_M": 1.25, "Q2_K": 1.35,
    "AWQ-4bit": 1.2, "AWQ-8bit": 0.85,
@@ -23,7 +23,7 @@ QUANT_SPEED_MULT = {
 }
 QUANT_QUALITY_PENALTY = {
-    "F16": 0.0, "BF16": 0.0, "FP8": 0.0,
+    "F16": 0.0, "BF16": 0.0, "FP8": 0.0, "NVFP4": 0.0,
    "Q8_0": 0.0, "Q6_K": -1.0, "Q5_K_M": -2.0,
    "Q4_K_M": -5.0, "Q4_0": -5.0, "Q3_K_M": -8.0, "Q2_K": -12.0,
    "AWQ-4bit": -3.0, "AWQ-8bit": 0.0,
@@ -32,7 +32,7 @@ QUANT_QUALITY_PENALTY = {
 }
 QUANT_BYTES_PER_PARAM = {
-    "F16": 2.0, "BF16": 2.0, "FP8": 1.0,
+    "F16": 2.0, "BF16": 2.0, "FP8": 1.0, "NVFP4": 0.5,
    "Q8_0": 1.0, "Q6_K": 0.75, "Q5_K_M": 0.625,
    "Q4_K_M": 0.5, "Q4_0": 0.5, "Q3_K_M": 0.375, "Q2_K": 0.25,
    "AWQ-4bit": 0.5, "AWQ-8bit": 1.0,
@@ -41,12 +41,13 @@ QUANT_BYTES_PER_PARAM = {
 }
 # Pre-quantized formats that should NOT go through the GGUF quant hierarchy
-PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8")
+PREQUANTIZED_PREFIXES = ("AWQ-", "GPTQ-", "mlx-", "FP8", "NVFP4")
 def is_prequantized(model):
    q = model.get("quantization", "")
-    return any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
+    name = (model.get("name") or "").lower()
    return "nvfp4" in name or any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
 def params_b(model):
@@ -502,6 +502,11 @@ async def _direct_fallback(
                )
            except asyncio.TimeoutError:
                return {"error": f"web_fetch: timed out fetching {url}", "exit_code": 1}
            except Exception as e:
                # Direct URL fetches can hit bot protection / auth walls
                # (e.g. eBay 403). Treat that as a tool failure the model can
                # reason around, not an uncaught chat-stream 500.
                return {"error": f"web_fetch: {url}: {e}", "exit_code": 1}
            err = result.get("error")
            text = (result.get("content") or "").strip()
            title = result.get("title") or ""
@@ -27,6 +27,56 @@ import spinnerModule from './spinner.js';
 // ── Error diagnosis ──
 // Open the Cookbook on its Dependencies tab and, when `pkgName` is given,
 // scroll to the matching dependency row and flash it briefly.
 //   pkgName: package to highlight (case-insensitive); '' just opens the tab.
 function _openCookbookDependencies(pkgName = '') {
  const cookbook = window.cookbookModule;
  if (cookbook && typeof cookbook.open === 'function') {
    cookbook.open({ tab: 'Dependencies' });
  } else {
    // No module API available: fall back to clicking the element with id
    // 'tool-cookbook-btn' (presumably the Cookbook toolbar button).
    document.getElementById('tool-cookbook-btn')?.click();
  }
  const wanted = String(pkgName || '').toLowerCase();
  // The dependency rows may not be rendered yet, so retry every 100 ms
  // (at most 45 retries) until at least one row exists.
  const tryHighlight = (attempt = 0) => {
    const modal = document.getElementById('cookbook-modal');
    const tab = modal?.querySelector('.cookbook-tab[data-backend="Dependencies"]');
    if (tab && !tab.classList.contains('active')) tab.click();
    const rows = [...document.querySelectorAll('#cookbook-deps-list [data-pkg-name]')];
    if (!rows.length) {
      if (attempt < 45) setTimeout(() => tryHighlight(attempt + 1), 100);
      return;
    }
    if (!wanted) return;
    const row = rows.find(r => {
      const name = (r.dataset.pkgName || '').toLowerCase();
      const pip = (r.dataset.depPip || '').toLowerCase();
      // `wanted.includes('')` is always true, so a row with an empty
      // data-pkg-name must not be allowed to match through the reverse test.
      return name === wanted || pip.includes(wanted) || (name !== '' && wanted.includes(name));
    });
    if (row) {
      row.scrollIntoView({ block: 'center' });
      row.classList.add('cookbook-pkg-flash');
      // Drop the flash class again so the highlight can be re-triggered later.
      setTimeout(() => row.classList.remove('cookbook-pkg-flash'), 1800);
    }
  };
  tryHighlight();
 }
 // Ask the task that owns `panel` to open its serve editor by dispatching a
 // bubbling 'cookbook:edit-serve' event on the enclosing '.cookbook-task'
 // element. `fields` (default null) is passed through as `detail.fields`.
 // Does nothing when the panel is not inside a task element.
 function _openServeEditFromDiagnosis(panel, fields = null) {
  const taskEl = panel?.closest?.('.cookbook-task');
  if (taskEl) {
    const editRequest = new CustomEvent('cookbook:edit-serve', {
      bubbles: true,
      detail: { fields },
    });
    taskEl.dispatchEvent(editRequest);
  }
 }
 // Open the serve editor for the task that owns `panel`, passing along
 // llama.cpp field overrides (no GPUs selected, tensor-parallel 1) with the
 // backend switch forced.
 function _openCpuServeEdit(panel) {
  const cpuFields = {
    backend: 'llamacpp',
    gpus: '',
    tp: '1',
    gpu_mem: '0.80',
    _forceBackend: true,
  };
  _openServeEditFromDiagnosis(panel, cpuFields);
 }
 // Infer the gated base repo that single-file checkpoints need configs from
 function _inferBaseRepo(text) {
  if (!text) return null;
@@ -218,6 +268,7 @@ export const ERROR_PATTERNS = [
    pattern: /vllm.*command not found|No module named vllm/i,
    message: 'vLLM is not installed or not in PATH.',
    fixes: [
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('vllm') },
      { label: 'Check environment is set', action: (panel) => {
        const el = panel.querySelector('[data-field="env_type"]');
        if (el) { el.focus(); el.style.borderColor = 'var(--red)'; }
@@ -226,11 +277,21 @@ export const ERROR_PATTERNS = [
  },
  {
    pattern: /sglang.*command not found|No module named sglang|SGLang is not installed/i,
-    message: 'SGLang is not installed or not in PATH. Open Cookbook → Dependencies and install sglang on this server.',
+    message: 'SGLang is not installed or not in PATH.',
    fixes: [
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') },
      { label: 'Copy install command', action: () => _copyText('python3 -m pip install "sglang[all]"') },
    ],
  },
  {
    pattern: /No accelerator \(CUDA, XPU, HPU, NPU, MUSA, MPS\) is available|Triton is not supported on current platform/i,
    message: 'SGLang needs a visible GPU/accelerator on this server.',
    suggestion: 'Suggested action: switch this serve config to llama.cpp for CPU/local serving, or choose a GPU server.',
    fixes: [
      { label: 'Switch to llama.cpp', action: (panel) => _openCpuServeEdit(panel) },
      { label: 'Choose GPU server', action: (panel) => _openServeEditFromDiagnosis(panel) },
    ],
  },
  {
    pattern: /flashinfer.*version.*does not match|flashinfer-cubin version/i,
    message: 'FlashInfer version mismatch.',
@@ -241,8 +302,12 @@ export const ERROR_PATTERNS = [
  },
  {
    pattern: /torch\.cuda\.is_available\(\).*False|No CUDA runtime/i,
-    message: 'CUDA not available in this environment.',
+    message: 'vLLM needs a visible CUDA/ROCm GPU.',
-    fixes: [],
+    suggestion: 'Suggested action: switch this serve config to llama.cpp for CPU/local serving, or choose a GPU server.',
    fixes: [
      { label: 'Switch to llama.cpp', action: (panel) => _openCpuServeEdit(panel) },
      { label: 'Choose GPU server', action: (panel) => _openServeEditFromDiagnosis(panel) },
    ],
  },
  {
    pattern: /Engine core initialization failed/i,
@@ -295,17 +360,20 @@ export const ERROR_PATTERNS = [
  },
  {
    pattern: /Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels\/layer/i,
-    message: 'vLLM/Transformers kernel package mismatch.',
+    message: 'Transformers/kernels package mismatch.',
    fixes: [
-      { label: 'Update vLLM/Transformers/kernels', action: (panel) => {
+      { label: 'Repair kernel package', action: (panel) => {
        const taskEl = panel.closest('.cookbook-task');
        const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
        const host = task?.remoteHost || '';
        const prefix = _buildEnvPrefix();
-        const pipCmd = prefix ? prefix + ' python3 -m pip install -U vllm transformers kernels' : 'python3 -m pip install -U vllm transformers kernels';
+        const pipCmd = prefix
          ? prefix + ' python3 -m pip install --user --break-system-packages "kernels<0.15"'
          : 'python3 -m pip install --user --break-system-packages "kernels<0.15"';
        const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
-        _launchServeTask('update-vllm-stack', 'pip-update', cmd);
+        _launchServeTask('repair-kernels', 'pip-update', cmd);
      }},
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') },
    ],
  },
  {
@@ -319,13 +387,24 @@ export const ERROR_PATTERNS = [
    pattern: /llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'/i,
    message: 'llama-cpp-python server is not installed. Run: pip install "llama-cpp-python[server]"',
    fixes: [
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
      { label: 'Copy install command', action: () => _copyText('pip install "llama-cpp-python[server]"') },
    ],
  },
  {
    pattern: /CUDA Toolkit not found|Unable to find cudart library|missing:\s*CUDA_CUDART/i,
    message: 'llama.cpp found nvcc, but the CUDA runtime library is missing.',
    suggestion: 'Suggested action: relaunch with the updated runner so llama.cpp builds CPU-only, or install a complete CUDA toolkit/runtime on this server for GPU llama.cpp.',
    fixes: [
      { label: 'Edit serve', action: (panel) => _openServeEditFromDiagnosis(panel) },
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
    ],
  },
  {
    pattern: /No module named ['"]?torch|No module named ['"]?diffusers|diffusers.*command not found/i,
    message: 'Diffusion serving needs PyTorch and diffusers. Install diffusers from Cookbook → Dependencies.',
    fixes: [
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('diffusers') },
      { label: 'Copy install command', action: () => _copyText('python3 -m pip install "diffusers[torch]"') },
    ],
  },
@@ -402,10 +481,32 @@ export function _diagnose(text) {
  return null;
 }
 /**
  * Build the Markdown "troubleshooting bundle" that the diagnosis card copies
  * to the clipboard.
  *
  * @param {object|null} task           Task record; when falsy the "Task" section is omitted.
  * @param {object|null} diagnosis      Diagnosis object; only `message` is read.
  * @param {*}           sourceText     Captured output; coerced to a string and trimmed.
  * @param {*}           suggestionText Suggested action; a leading "Suggested action:" label is stripped.
  * @returns {string} Markdown text with sections joined by newlines.
  */
 function _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText) {
  const lines = ['## Odysseus Cookbook troubleshooting'];
  if (task) {
    lines.push(
      '',
      '### Task',
      `- ID: ${task.sessionId || task.id || 'unknown'}`,
      `- Type: ${task.type || 'unknown'}`,
      `- Status: ${task.status || 'unknown'}`,
      `- Model: ${task.payload?.repo_id || task.name || 'unknown'}`,
      `- Host: ${task.remoteHost || 'local'}${task.sshPort ? `:${task.sshPort}` : ''}`,
    );
  }
  lines.push('', '### Diagnosis', diagnosis?.message || '(none)');
  if (suggestionText) {
    // The section heading already says "Suggested action", so drop the duplicate label.
    lines.push('', '### Suggested action', String(suggestionText).replace(/^Suggested action:\s*/i, ''));
  }
  const cmd = task?.payload?._cmd || '';
  if (cmd) lines.push('', '### Launch command', '```bash', cmd, '```');
  // Trim before testing: whitespace-only output is truthy and would otherwise
  // produce an empty fenced "Captured output" section.
  const captured = sourceText ? String(sourceText).trim() : '';
  if (captured) lines.push('', '### Captured output', '```text', captured, '```');
  return lines.join('\n');
 }
 export function _showDiagnosis(panel, diagnosis, sourceText) {
-  if (panel._lastDiagMsg === diagnosis.message) return;
+  const wasCollapsed = panel._lastDiagMsg === diagnosis.message && panel._diagCollapsed;
-  if (panel._diagDismissed === diagnosis.message) return; // stay dismissed until new error
+  if (panel._diagDismissed === diagnosis.message) return;
  panel._lastDiagMsg = diagnosis.message;
  panel._diagCollapsed = !!wasCollapsed;
  let diag = panel.querySelector('.cookbook-diagnosis');
  if (!diag) {
@@ -417,57 +518,161 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
  }
  diag.classList.remove('hidden');
  diag.innerHTML = '';
  const taskEl = panel?.closest?.('.cookbook-task');
  const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
  const fixes = [...(diagnosis.fixes || [])];
  if (task?.type === 'serve' && task.payload?._cmd && !fixes.some(f => f.label === 'Edit serve')) {
    fixes.push({ label: 'Edit serve', action: (p) => _openServeEditFromDiagnosis(p) });
  }
  const suggestionText = diagnosis.suggestion || (fixes.length
    ? `Suggested action: ${fixes[0].label}.`
    : 'Suggested action: copy the error and adjust the serve settings.');
  const header = document.createElement('div');
-  header.style.cssText = 'display:flex;align-items:center;justify-content:space-between;';
+  header.className = 'cookbook-diag-header';
-  const msg = document.createElement('div');
+  const fold = document.createElement('button');
-  msg.className = 'cookbook-diag-message';
+  fold.className = 'cookbook-diag-fold';
-  msg.textContent = diagnosis.message;
+  fold.type = 'button';
-  header.appendChild(msg);
+  fold.innerHTML = '<span class="cookbook-diag-chevron">▾</span><span>Error message:</span>';
  header.appendChild(fold);
  const copy = document.createElement('button');
  copy.className = 'cookbook-diag-copy';
  copy.type = 'button';
  copy.title = 'Copy troubleshooting bundle';
  copy.setAttribute('aria-label', 'Copy troubleshooting bundle');
  copy.innerHTML = '<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>';
  copy.addEventListener('click', (e) => {
    e.stopPropagation();
    _copyText(_diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText));
    copy.classList.add('copied');
    copy.innerHTML = '<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.6" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg>';
    setTimeout(() => {
      if (!copy.isConnected) return;
      copy.classList.remove('copied');
      copy.innerHTML = '<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>';
    }, 1200);
  });
  header.appendChild(copy);
  const dismiss = document.createElement('button');
-  dismiss.className = 'close-btn';
+  dismiss.className = 'cookbook-diag-dismiss';
-  dismiss.style.cssText = 'width:16px;height:16px;font-size:9px;flex-shrink:0;';
+  dismiss.type = 'button';
-  dismiss.textContent = '\u2715';
+  dismiss.title = 'Dismiss error';
-  dismiss.addEventListener('click', () => { panel._diagDismissed = diagnosis.message; _clearDiagnosis(panel); });
+  dismiss.setAttribute('aria-label', 'Dismiss error');
  dismiss.textContent = '×';
  dismiss.addEventListener('click', (e) => {
    e.stopPropagation();
    panel._diagDismissed = diagnosis.message;
    _clearDiagnosis(panel);
  });
  header.appendChild(dismiss);
  diag.appendChild(header);
-  if (diagnosis.fixes && diagnosis.fixes.length) {
+  const body = document.createElement('div');
  body.className = 'cookbook-diag-body';
  body.classList.toggle('hidden', panel._diagCollapsed);
  fold.querySelector('.cookbook-diag-chevron').textContent = panel._diagCollapsed ? '▸' : '▾';
  const msg = document.createElement('div');
  msg.className = 'cookbook-diag-message';
  msg.textContent = diagnosis.message;
  body.appendChild(msg);
  const suggestion = document.createElement('div');
  suggestion.className = 'cookbook-diag-suggestion';
  suggestion.textContent = suggestionText;
  body.appendChild(suggestion);
  fold.addEventListener('click', (e) => {
    e.stopPropagation();
    panel._diagCollapsed = !panel._diagCollapsed;
    body.classList.toggle('hidden', panel._diagCollapsed);
    fold.querySelector('.cookbook-diag-chevron').textContent = panel._diagCollapsed ? '▸' : '▾';
  });
  diag.appendChild(body);
  const runFix = async (fix, button, busyLabel = fix.label, onStart = null, onDone = null) => {
    if (!fix || !button || button.dataset.busy) return;
    button.dataset.busy = '1';
    const _orig = button.textContent;
    const wp = spinnerModule.createWhirlpool(12);
    wp.element.style.cssText = 'display:inline-block;vertical-align:middle;width:12px;height:12px;margin-right:5px;';
    button.textContent = '';
    button.appendChild(wp.element);
    const _lbl = document.createElement('span');
    _lbl.textContent = busyLabel;
    _lbl.style.verticalAlign = 'middle';
    button.appendChild(_lbl);
    try {
      if (typeof onStart === 'function') onStart();
      await fix.action(panel, sourceText);
    } catch (err) {
      console.error('[cookbook] diagnosis fix failed', err);
    } finally {
      if (button.isConnected) {
        try { wp.destroy(); } catch {}
        button.textContent = _orig;
        delete button.dataset.busy;
      }
      if (typeof onDone === 'function') onDone();
    }
  };
  if (fixes.length) {
    const row = document.createElement('div');
    row.className = 'cookbook-diag-fixes';
-    for (const fix of diagnosis.fixes) {
+
-      const btn = document.createElement('button');
+    if (fixes.length <= 3) {
-      btn.className = 'cookbook-btn cookbook-diag-btn';
+      for (const fix of fixes) {
-      btn.textContent = fix.label;
+        const btn = document.createElement('button');
-      btn.addEventListener('click', async () => {
+        btn.className = 'cookbook-btn cookbook-diag-btn';
-        if (btn.dataset.busy) return;
+        btn.type = 'button';
-        btn.dataset.busy = '1';
+        btn.textContent = fix.label;
-        // Spinner feedback while the fix runs (kill + relaunch takes a moment).
+        btn.addEventListener('click', (e) => {
-        const _orig = btn.textContent;
+          e.stopPropagation();
-        const wp = spinnerModule.createWhirlpool(12);
+          runFix(fix, btn);
-        wp.element.style.cssText = 'display:inline-block;vertical-align:middle;width:12px;height:12px;margin-right:5px;';
+        });
-        btn.textContent = '';
+        row.appendChild(btn);
-        btn.appendChild(wp.element);
+      }
-        const _lbl = document.createElement('span');
+      body.appendChild(row);
-        _lbl.textContent = _orig;
+      return;
        _lbl.style.verticalAlign = 'middle';
        btn.appendChild(_lbl);
        try {
          await fix.action(panel, sourceText);
        } catch (e) {
          console.error('[cookbook] diagnosis fix failed', e);
        } finally {
          // Retries animate the whole card away (button goes with it). For fixes
          // that leave the card in place, restore the label.
          if (btn.isConnected) { try { wp.destroy(); } catch {} btn.textContent = _orig; delete btn.dataset.busy; }
        }
      });
      row.appendChild(btn);
    }
-    diag.appendChild(row);
+
    const wrap = document.createElement('div');
    wrap.className = 'cookbook-diag-actions';
    const trigger = document.createElement('button');
    trigger.className = 'cookbook-btn cookbook-diag-action-trigger';
    trigger.type = 'button';
    trigger.textContent = 'Actions';
    trigger.appendChild(document.createTextNode(' ▾'));
    wrap.appendChild(trigger);
    const menu = document.createElement('div');
    menu.className = 'dropdown cookbook-diag-menu hidden';
    for (const fix of fixes) {
      const item = document.createElement('button');
      item.type = 'button';
      item.textContent = fix.label;
      item.addEventListener('click', async (e) => {
        e.stopPropagation();
        if (item.dataset.busy || trigger.dataset.busy) return;
        item.dataset.busy = '1';
        await runFix(fix, trigger, fix.label, () => menu.classList.add('hidden'), () => delete item.dataset.busy);
      });
      menu.appendChild(item);
    }
    wrap.appendChild(menu);
    trigger.addEventListener('click', (e) => {
      e.stopPropagation();
      if (trigger.dataset.busy) return;
      document.querySelectorAll('.cookbook-diag-menu').forEach(m => {
        if (m !== menu) m.classList.add('hidden');
      });
      menu.classList.toggle('hidden');
    });
    row.appendChild(wrap);
    body.appendChild(row);
  }
 }
@@ -193,6 +193,8 @@ export function _renderGpuToggles(system) {
        if (quantSel) {
          if (count <= 1) {
            quantSel.value = 'Q4_K_M'; // RAM or 1 GPU -> Q4 sweet spot
          } else if (String(system?.backend || '').toLowerCase() === 'rocm') {
            quantSel.value = 'Q4_K_M'; // ROCm default stays GGUF/local-safe; AWQ is explicit only
          } else {
            quantSel.value = 'AWQ-4bit'; // Multi-GPU -> AWQ for vLLM
          }
@@ -260,12 +260,31 @@ export function _detectBackend(model) {
  const q = (model.quant || '').toUpperCase();
  const sysBackend = String(_hwfitCache?.system?.backend || '').toLowerCase();
  const isRocm = sysBackend === 'rocm';
  const isAppleSilicon = ['metal', 'mps', 'apple'].includes(sysBackend);
  const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
  if (!isAppleSilicon && (/\bmlx\b|mlx-|_mlx/i.test(_nm) || q.startsWith('MLX'))) {
    return { backend: 'unsupported', label: 'Unsupported' };
  }
  const isAwqLike = /^AWQ|^GPTQ|^NVFP4/.test(q) || q === 'FP8' || /\b(awq|gptq|fp8|nvfp4)\b/i.test(_nm);
  const isGgufLike = model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf');
  // Image gen models → diffusers
  if (model.is_image_gen || model.is_diffusion || model._tag === 'image') {
    return { backend: 'diffusers', label: 'Diffusers' };
  }
  // AWQ / GPTQ / FP8 are safetensors GPU-serving formats. Never route them
  // through llama.cpp/Ollama just because the host is Mac/Windows; those engines
  // need GGUF. The UI will warn/block on Metal where vLLM/SGLang aren't viable.
  if (isAwqLike) {
    return { backend: 'vllm', label: 'vLLM' };
  }
  // GGUF → llama.cpp/Ollama-compatible.
  if (isGgufLike) {
    return { backend: 'llamacpp', label: 'llama.cpp' };
  }
  // Windows → default to llama.cpp (no vLLM support on Windows)
  if (_isWindows()) {
    return { backend: 'llamacpp', label: 'llama.cpp' };
@@ -278,19 +297,6 @@ export function _detectBackend(model) {
    return { backend: 'llamacpp', label: 'llama.cpp' };
  }
  // AWQ / GPTQ / FP8 → vLLM
  if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') {
    return { backend: 'vllm', label: 'vLLM' };
  }
  // GGUF → llama.cpp. Match the quant tag OR a gguf hint in the repo/path/name:
  // a raw .gguf file often has no quant field, which made it fall through to the
  // vLLM default below.
  const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
  if (model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf')) {
    return { backend: 'llamacpp', label: 'llama.cpp' };
  }
  // ROCm/AMD machines should not blindly default HF safetensors models to
  // vLLM. SGLang is the safer OpenAI-compatible default for plain HF text
  // repos there; llama.cpp still wins above whenever the model is GGUF.
@@ -1020,6 +1026,16 @@ function _wireTabEvents(body) {
  // Download input
  const dlBtn = document.getElementById('cookbook-dl-btn');
  const dlInput = document.getElementById('cookbook-dl-repo');
  const dlCardToggle = document.getElementById('cookbook-download-card-toggle');
  const dlCardBody = document.getElementById('cookbook-download-card-body');
  const dlCardArrow = document.getElementById('cookbook-download-card-arrow');
  if (dlCardToggle && dlCardBody) {
    dlCardToggle.addEventListener('click', () => {
      const isOpen = dlCardBody.style.display !== 'none';
      dlCardBody.style.display = isOpen ? 'none' : 'block';
      if (dlCardArrow) dlCardArrow.style.transform = isOpen ? 'rotate(0deg)' : 'rotate(90deg)';
    });
  }
  if (dlBtn && dlInput) {
    function _stripHfUrl(input) {
      let repo = input.trim();
@@ -1099,8 +1115,12 @@ function _wireTabEvents(body) {
  if (hfToggle && hfList) {
    let _loaded = false;
    // Per-server VRAM cache so we don't re-probe on every expand
-    const _vramCache = {};
+    const _hwCache = {};
-    async function _getSelectedServerVram() {
+    function _hfModelLooksAwqLike(m) {
      const text = `${m?.repo_id || ''} ${(m?.tags || []).join(' ')}`.toLowerCase();
      return /\b(awq|gptq|fp8|4bit|int4)\b/.test(text);
    }
    async function _getSelectedServerHw() {
      // Prefer the "What Fits" dropdown (the main control that shows hardware);
      // fall back to the download dropdown. This is the server the list ranks for.
      const dlSrv = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
@@ -1117,7 +1137,7 @@ function _wireTabEvents(body) {
        }
      }
      const cacheKey = host || 'local';
-      if (_vramCache[cacheKey] !== undefined) return _vramCache[cacheKey];
+      if (_hwCache[cacheKey]) return _hwCache[cacheKey];
      // Fetch system info for this server from hwfit
      try {
        const qp = new URLSearchParams();
@@ -1127,13 +1147,13 @@ function _wireTabEvents(body) {
        const r = await fetch(`/api/hwfit/system?${qp}`);
        if (r.ok) {
          const sys = await r.json();
-          const v = sys?.gpu_vram_gb || 0;
+          const hw = { vram: sys?.gpu_vram_gb || 0, backend: String(sys?.backend || '').toLowerCase() };
-          _vramCache[cacheKey] = v;
+          _hwCache[cacheKey] = hw;
-          return v;
+          return hw;
        }
      } catch {}
-      _vramCache[cacheKey] = 0;
+      _hwCache[cacheKey] = { vram: 0, backend: '' };
-      return 0;
+      return _hwCache[cacheKey];
    }
    async function _loadLatest() {
      // Match the Dependencies loader: whirlpool spinner + text label so the
@@ -1152,7 +1172,8 @@ function _wireTabEvents(body) {
      } catch {
        hfList.innerHTML = '<div class="hwfit-loading">Scanning models…</div>';
      }
-      const vram = await _getSelectedServerVram();
+      const hwInfo = await _getSelectedServerHw();
      const vram = hwInfo.vram || 0;
      try {
        let lastErr = '';
        const _fetchLatest = async (v) => {
@@ -1168,6 +1189,9 @@ function _wireTabEvents(body) {
        if (!models.length && vram > 0) {
          models = await _fetchLatest(0);
        }
        if (['rocm', 'metal', 'mps', 'apple', 'generic', 'cpu'].includes(hwInfo.backend)) {
          models = models.filter(m => !_hfModelLooksAwqLike(m));
        }
        if (!models.length) {
          // Distinguish "the HF API failed" from "nothing matched" so an outage
          // doesn't masquerade as no-fitting-models.
@@ -1351,10 +1375,12 @@ function _renderRecipes() {
  // Search group
  html += '<div class="cookbook-group" data-backend-group="Search" style="flex:0 0 auto;">';
  html += '<div class="admin-card" style="display:flex;flex-direction:column;overflow:hidden;">';
-  html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">';
+  html += '<button type="button" id="cookbook-download-card-toggle" style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;width:100%;background:transparent;border:0;padding:0;color:inherit;text-align:left;cursor:pointer;">';
  html += '<h2 style="margin:0;padding:0;line-height:1;">Download</h2>';
-  html += '</div>';
+  html += '<span id="cookbook-download-card-arrow" style="margin-left:auto;display:inline-block;transition:transform 0.15s;font-size:13px;line-height:1;">\u25B8</span>';
-  html += '<p class="memory-desc doclib-desc" style="margin-top:6px;">Download from <a href="https://huggingface.co/models" target="_blank" rel="noopener" style="color:var(--accent,var(--red));text-decoration:none;"><svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-1px;margin-right:1px;"><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"/><polyline points="15 3 21 3 21 9"/><line x1="10" y1="14" x2="21" y2="3"/></svg>HuggingFace</a> by pasting model link, or download directly in the Scan section below.</p>';
+  html += '</button>';
  html += '<div id="cookbook-download-card-body" style="display:none;">';
  html += '<p class="memory-desc doclib-desc" style="margin-top:6px;">Download directly from Scan, or paste a HuggingFace model link.</p>';
  html += '<div class="hwfit-container" id="hwfit-container">';
  // Section 1: Settings
@@ -1383,7 +1409,7 @@ function _renderRecipes() {
  // silently sending downloads to the wrong server. An empty selection means Local; the user
  // chooses a remote server explicitly via the dropdown.
-  // Download input
+  // Manual download input
  html += `<div style="margin-top:7px;margin-bottom:2px;display:flex;gap:4px;align-items:center;">`;
  if (_es.servers.length > 1) {
    html += `<select class="cookbook-field-input hwfit-dl-server" id="hwfit-dl-server" style="height:28px;position:relative;top:0px;">`;
@@ -1399,7 +1425,7 @@ function _renderRecipes() {
  html += `<button class="cookbook-btn cookbook-dl-btn" id="cookbook-dl-btn">Download</button>`;
  html += `</div>`;
  // Latest HF models that fit — collapsible card list
-  html += `<div style="margin-top:2px;position:relative;top:-8px;">`;
+  html += `<div style="margin-top:5px;position:relative;top:-3px;">`;
  html += `<div style="display:flex;gap:4px;align-items:center;">`;
  html += `<button type="button" class="memory-toolbar-btn" id="cookbook-hf-latest-toggle" style="flex:1;text-align:left;height:26px;display:flex;align-items:center;gap:6px;border-radius:4px;">`;
  html += `<span id="cookbook-hf-latest-arrow" style="display:inline-block;transition:transform 0.15s;pointer-events:none;">\u25B8</span>`;
@@ -1411,7 +1437,7 @@ function _renderRecipes() {
  html += `</div>`;
  // Search section
-  html += '</div></div></div>';
+  html += '</div></div></div></div>';
  html += '<div class="cookbook-group" data-backend-group="Search">';
  html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
  html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">';
@@ -86,6 +86,9 @@ function _ggufIncludePattern(model, source) {
 // User-facing explanation for a model that has no GGUF download source.
 // NVFP4 checkpoints (detected from the model name) get a more specific hint.
 function _missingGgufMessage(model) {
  const label = model?.name || 'this model';
  const isNvfp4 = /\bnvfp4\b/i.test(label);
  return isNvfp4
    ? `${label} is an NVIDIA NVFP4 checkpoint, not a GGUF download. Pick the base model row with an Unsloth GGUF source, or paste the GGUF repo directly.`
    : `No GGUF source is configured for ${label}. Pick a model with a GGUF source, or paste the GGUF repo in Download.`;
 }
@@ -34,12 +34,106 @@ function _taskBadge(task) {
  return { text: _statusLabel(task.status, task.type), cls: 'cookbook-task-' + task.status };
 }
 // Whether a task card may be cleared. Running tasks, serve tasks that are
 // ready, and finished non-dependency downloads are never clearable; otherwise
 // only tasks in a finished/failed status are.
 function _canClearTask(task) {
  if (!task) return false;
  const { status, type } = task;
  if (status === 'running') return false;

  const serveIsLive = type === 'serve' && (status === 'ready' || task._serveReady);
  if (serveIsLive) return false;

  const finishedModelDownload = type === 'download' && status === 'done' && !task.payload?._dep;
  if (finishedModelDownload) return false;

  switch (status) {
    case 'done':
    case 'stopped':
    case 'error':
    case 'crashed':
    case 'failed':
      return true;
    default:
      return false;
  }
 }
 // Text for a task card's clear pill. Currently a constant: `task` is accepted
 // but not read, so every task gets the same label.
 function _clearPillLabel(task) {
  return 'clear';
 }
 // Whether a task qualifies for a crash report: an unreachable serve task, or
 // any task whose status is error/crashed/failed.
 function _shouldOfferCrashReport(task) {
  if (!task) return false;
  const unreachableServe = task._unreachable && task.type === 'serve';
  if (unreachableServe) return true;
  const failureStates = ['error', 'crashed', 'failed'];
  return failureStates.some((state) => state === task.status);
 }
 // True when the task's repo id / name mentions AWQ, GPTQ or FP8 AND its launch
 // command or captured output mentions a llama.cpp / Ollama / unified-memory
 // marker. Both checks run on lowercased text.
 function _serveTaskLooksAwqOnLocalBackend(task, outputText = '') {
  const payload = task?.payload;
  const modelText = [payload?.repo_id || '', task?.name || ''].join(' ').toLowerCase();
  if (!/\b(awq|gptq|fp8)\b/.test(modelText)) return false;
  const launchText = [payload?._cmd || '', outputText || ''].join(' ').toLowerCase();
  return /(llama-server|llama_cpp\.server|ollama|ggml_cuda_enable_unified_memory)/.test(launchText);
 }
 // True when the task's repo id / name mentions AWQ, GPTQ or FP8 AND the
 // captured output contains one of the "no usable accelerator" error markers.
 function _serveTaskLooksAwqWithoutUsableAccelerator(task, outputText = '') {
  const modelText = [task?.payload?.repo_id || '', task?.name || ''].join(' ').toLowerCase();
  if (!/\b(awq|gptq|fp8)\b/.test(modelText)) return false;
  const logText = String(outputText || '').toLowerCase();
  return /(no accelerator|no cuda runtime|failed to infer device type|triton is not supported|0 active driver)/i.test(logText);
 }
 // Open the Cookbook "Search" tab pre-filled so the user can look for a GGUF
 // build of the model behind `task`. Reads `task.payload.repo_id` (falling back
 // to `task.name`); everything else is DOM side effects.
 async function _openDownloadForGgufTask(task) {
  const raw = task?.payload?.repo_id || task?.name || '';
  // Take the last path segment, cut everything from a -AWQ/-GPTQ/-FP8/-4bit/…
  // suffix onward, and strip trailing separators. If that leaves an empty
  // string, fall back to the untrimmed last segment, then to the raw value.
  const modelName = String(raw)
    .split('/').pop()
    .replace(/[-_](?:AWQ|GPTQ|FP8|4bit|8bit|Int4|Int8).*$/i, '')
    .replace(/[-_]+$/g, '')
    || String(raw).split('/').pop()
    || raw;
  // Prefer the module API when it is exposed on window; otherwise click the
  // toolbar button.
  const cookbook = window.cookbookModule;
  if (cookbook && typeof cookbook.open === 'function') {
    cookbook.open({ tab: 'Search' });
  } else {
    document.getElementById('tool-cookbook-btn')?.click();
  }
  // Deferred by 80 ms — presumably to let the modal render before its
  // elements are queried (TODO confirm).
  setTimeout(async () => {
    const modal = document.getElementById('cookbook-modal');
    const tab = modal?.querySelector('.cookbook-tab[data-backend="Search"]');
    if (tab && !tab.classList.contains('active')) tab.click();
    // Fill the search box and fire `input` so listeners react to the new value.
    const search = document.getElementById('hwfit-search');
    if (search) {
      search.value = modelName;
      search.dispatchEvent(new Event('input', { bubbles: true }));
      search.focus();
    }
    // Select the Q4_K_M quant and fire `change` for its listeners.
    const quant = document.getElementById('hwfit-quant');
    if (quant) {
      quant.value = 'Q4_K_M';
      quant.dispatchEvent(new Event('change', { bubbles: true }));
    }
    // Best effort: import or fetch failures are swallowed on purpose.
    // NOTE(review): the meaning of the `true` argument to _hwfitFetch is not
    // visible here — confirm against cookbook-hwfit.js.
    try {
      const hwfit = await import('./cookbook-hwfit.js');
      if (typeof hwfit._hwfitFetch === 'function') hwfit._hwfitFetch(true);
    } catch {}
  }, 80);
 }
 // Produce a diagnosis object ({ message, suggestion, fixes }) for a serve task
 // that ended in a terminal status, or null when the task is not a terminated
 // serve task or there is no output to inspect. Checks the two AWQ/GPTQ/FP8
 // mismatch cases first, then the generic _diagnose() patterns, and finally
 // falls back to a generic "stopped before reachable" diagnosis.
 function _terminalServeDiagnosis(task, outputText) {
  const text = String(outputText || task?.output || '');
  const terminalStates = ['stopped', 'error', 'crashed', 'failed'];
  if (!task) return null;
  if (task.type !== 'serve') return null;
  if (!terminalStates.includes(task.status)) return null;
  if (!text.trim()) return null;

  // Fix entries shared by the AWQ-specific diagnoses; built fresh per call site.
  const ggufAndEditFixes = () => [
    { label: 'Find GGUF download', action: () => _openDownloadForGgufTask(task) },
    { label: 'Edit serve', action: (panel) => _openServeEditForTask(task) },
  ];

  if (_serveTaskLooksAwqOnLocalBackend(task, text)) {
    return {
      message: 'AWQ/GPTQ/FP8 cannot be served through llama.cpp/Ollama unified-memory mode.',
      suggestion: 'Suggested action: use vLLM/SGLang on a compatible CUDA/ROCm GPU server, or download a GGUF version for llama.cpp/Ollama/unified-memory serving.',
      fixes: ggufAndEditFixes(),
    };
  }

  if (_serveTaskLooksAwqWithoutUsableAccelerator(task, text)) {
    return {
      message: 'AWQ/GPTQ/FP8 needs a working vLLM/SGLang accelerator path; this server did not expose one.',
      suggestion: 'Suggested action: choose a CUDA/ROCm server where vLLM/SGLang can see the GPU, or download a GGUF version and serve it with llama.cpp/Ollama.',
      fixes: ggufAndEditFixes(),
    };
  }

  const matched = _diagnose(text);
  if (matched) return matched;

  // Nothing matched: distinguish a llama.cpp source build from any other serve.
  const isLlamaBuild = /Native llama-server not found|building llama-server|llama\.cpp/i.test(text);
  let message;
  let suggestion;
  if (isLlamaBuild) {
    message = 'llama.cpp build stopped before the server became reachable.';
    suggestion = 'Suggested action: copy the troubleshooting bundle, then edit serve settings. For the quickest local/CPU path, use Ollama or a prebuilt llama-server; source builds can take several minutes and fail if build dependencies are incomplete.';
  } else {
    message = 'Serve stopped before the model became reachable.';
    suggestion = 'Suggested action: copy the troubleshooting bundle, then edit serve settings or relaunch with a CPU/backend fallback.';
  }
  return {
    message,
    suggestion,
    fixes: [{ label: 'Edit serve', action: (panel) => _openServeEditForTask(task) }],
  };
 }
 function _redactCrashReportText(text) {
  if (!text) return '';
  return String(text)
@@ -173,6 +267,23 @@ export function _parseServePhase(snapshot) {
  if (/Ollama API ready on port\s+\d+/i.test(flat)) {
    return { phase: 'ready', status: 'ready' };
  }
  const llamaBuildMatches = [...flat.matchAll(/\[\s*(\d{1,3})%\]\s*(?:Building|Linking)/gi)];
  if (llamaBuildMatches.length) {
    const pct = Math.min(100, parseInt(llamaBuildMatches[llamaBuildMatches.length - 1][1], 10));
    return { phase: `building llama.cpp ${pct}%`, status: 'running', pct };
  }
  if (/Native llama-server not found|building from source/i.test(flat)) {
    if (/Cloning into ['"]?llama\.cpp/i.test(flat) && !/Receiving objects:\s*100%/i.test(flat)) {
      return { phase: 'cloning llama.cpp', status: 'running' };
    }
    if (/Configuring incomplete|CMake Error/i.test(flat)) {
      return {};
    }
    if (/CMAKE_BUILD_TYPE|Detecting CXX|Found Threads|Including CPU backend|CUDA nvcc found|building llama-server/i.test(flat)) {
      return { phase: 'configuring llama.cpp', status: 'running' };
    }
    return { phase: 'building llama.cpp', status: 'running' };
  }
  // HTTP access logs (e.g. GET /v1/models 200 OK) mean the server is up
  if (/(?:GET|POST)\s+\/[^\s]*\s+HTTP\/[\d.]+"\s*\d{3}/.test(flat)) {
    return { phase: 'idle', status: 'ready' };
@@ -341,8 +452,24 @@ async function _startQueuedDownload(task) {
 // ── Task CRUD ──
 // True when a serve task is flagged `_serveReady` or its stored output
 // contains a readiness marker: an "Application startup complete" line, an
 // "Ollama API ready on port N" line, or an HTTP access-log entry with a 2xx
 // status.
 function _serveOutputLooksReady(task) {
  const text = String(task?.output || '');
  if (task?._serveReady) return true;
  const readyMarkers = [
    /Application startup complete/i,
    /Ollama API ready on port\s+\d+/i,
    /(?:GET|POST)\s+\/[^\s]*\s+HTTP\/[\d.]+"\s*2\d\d/i,
  ];
  return readyMarkers.some((marker) => marker.test(text));
 }
 // Returns the task to display. A serve task stored as 'done' whose output
 // never showed a readiness marker is returned as a copy with status 'error';
 // every other value (including non-objects) is returned unchanged.
 function _normalizeTaskForDisplay(task) {
  const isTaskObject = !!task && typeof task === 'object';
  if (!isTaskObject) return task;
  const claimsDoneServe = task.type === 'serve' && task.status === 'done';
  return claimsDoneServe && !_serveOutputLooksReady(task)
    ? { ...task, status: 'error' }
    : task;
 }
 export function _loadTasks() {
-  try { return JSON.parse(localStorage.getItem(TASKS_KEY)) || []; }
+  try { return (JSON.parse(localStorage.getItem(TASKS_KEY)) || []).map(_normalizeTaskForDisplay); }
  catch { return []; }
 }
@@ -876,7 +1003,7 @@ export async function _serveAutoFix(panel, envVar) {
 // Edit button, but optionally with a modified command (used by the diagnosis
 // "Retry with X" buttons so a retry lands in the editable Serve panel with the
 // adjusted setting, instead of blindly relaunching).
-async function _openServeEditForTask(task, cmdOverride) {
+async function _openServeEditForTask(task, cmdOverride, fieldOverrides = null) {
  const repo = task.payload?.repo_id;
  if (!repo) { uiModule.showToast('No model info on this task'); return; }
  const cmd = cmdOverride || task.payload?._cmd;
@@ -884,6 +1011,9 @@ async function _openServeEditForTask(task, cmdOverride) {
  let fields = cmdOverride
    ? _parseServeCmdToFields(cmd)
    : (task.payload?._fields || (cmd ? _parseServeCmdToFields(cmd) : null));
  if (fieldOverrides && typeof fieldOverrides === 'object') {
    fields = { ...(fields || {}), ...fieldOverrides };
  }
  // Switch the active server to the one this serve ran on (mirrors _openEdit).
  const _tHost = task.remoteHost || '';
  _envState.remoteHost = _tHost;
@@ -1352,8 +1482,8 @@ export function _renderRunningTab() {
      const host = btn.dataset.clearServer;
      if (!await window.styledConfirm(`Clear finished tasks on ${_serverName(host)}?`, { confirmText: 'Clear' })) return;
      const allTasks = _loadTasks();
-      const toRemove = allTasks.filter(t => (t.remoteHost || '') === host && t.status !== 'running');
+      const toRemove = allTasks.filter(t => (t.remoteHost || '') === host && _canClearTask(t));
-      const remaining = allTasks.filter(t => (t.remoteHost || '') !== host || t.status === 'running');
+      const remaining = allTasks.filter(t => (t.remoteHost || '') !== host || !_canClearTask(t));
      _saveTasks(remaining);
      // Fade/slide each finished card out (same exit as the per-card clear)
      // instead of yanking them instantly.
@@ -1443,16 +1573,19 @@ export function _renderRunningTab() {
        const _bdg = _taskBadge(task);
        badge.textContent = _bdg.text;
        badge.className = 'cookbook-task-status' + (_bdg.cls ? ' ' + _bdg.cls : '');
-        badge.style.display = isDone ? 'none' : '';   // hidden — type chip carries it
+        badge.style.display = '';
      }
      // Indicator: spinning wave while running, green check when finished.
      const wave = el.querySelector('.cookbook-task-wave');
      if (wave) wave.style.display = task.status === 'running' ? '' : 'none';
      // Model downloads (which have a Serve → button) don't get a clear pill —
      // pressing Serve clears them. Dep installs / serve tasks keep it.
      const check = el.querySelector('.cookbook-task-check');
-      const _showClear = isDone && !(task.type === 'download' && !task.payload?._dep);
+      if (check) {
-      if (check) check.style.display = _showClear ? '' : 'none';
+        check.style.display = _canClearTask(task) ? '' : 'none';
        const label = check.querySelector('.cookbook-task-done-label');
        if (label) label.textContent = _clearPillLabel(task);
      }
      const terminalDiag = _terminalServeDiagnosis(task, el.querySelector('.cookbook-output-pre')?.textContent || task.output || '');
      if (terminalDiag) _showDiagnosis(el, terminalDiag, el.querySelector('.cookbook-output-pre')?.textContent || task.output || '');
    }
    if (!task) {
      if (el._uptimeInterval) { clearInterval(el._uptimeInterval); el._uptimeInterval = null; }
@@ -1476,11 +1609,8 @@ export function _renderRunningTab() {
      <div class="cookbook-task-header">
        <span class="cookbook-task-type${(task.status === 'done' && task.type === 'download') ? ' cookbook-task-type-done' : ''}" data-type="${esc(task.type)}">${esc((task.status === 'done' && task.type === 'download') ? 'finished' : task.type)}</span>
        <span class="cookbook-task-name">${modelLogo(task.name)}${esc(task.name)}</span>
-        <span class="cookbook-task-status ${_bdg.cls}" style="display:${task.status === 'done' ? 'none' : ''}"${_bdgTitle}>${esc(_bdg.text)}</span>
+        <span class="cookbook-task-indicator"><span class="cookbook-task-wave" style="display:${task.status === 'running' ? '' : 'none'}"></span><span class="cookbook-task-check" title="Clear" style="display:${_canClearTask(task) ? '' : 'none'}"><svg class="cookbook-task-check-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg><svg class="cookbook-task-clear-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg><span class="cookbook-task-done-label">${esc(_clearPillLabel(task))}</span><span class="cookbook-task-clear-label">clear</span></span></span>
-        ${task.type === 'serve' && task.payload?._cmd ? '<button class="cookbook-task-edit-btn" title="Edit settings &amp; relaunch"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M11 4H4a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2v-7"/><path d="M18.5 2.5a2.121 2.121 0 0 1 3 3L12 15l-4 1 1-4 9.5-9.5z"/></svg></button>' : ''}
+        <span class="cookbook-task-status ${_bdg.cls}"${_bdgTitle}>${esc(_bdg.text)}</span>
        ${task.type === 'serve' && task.payload?._cmd ? '<button class="cookbook-task-save-btn" title="Save preset"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M19 21H5a2 2 0 0 1-2-2V5a2 2 0 0 1 2-2h11l5 5v11a2 2 0 0 1-2 2z"/><polyline points="17 21 17 13 7 13 7 21"/><polyline points="7 3 7 8 15 8"/></svg></button>' : ''}
        <span class="cookbook-task-indicator"><span class="cookbook-task-wave" style="display:${task.status === 'running' ? '' : 'none'}"></span><span class="cookbook-task-check" title="Clear" style="display:${(task.status === 'done' && !(task.type === 'download' && !task.payload?._dep)) ? '' : 'none'}"><svg class="cookbook-task-check-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg><svg class="cookbook-task-clear-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg><span class="cookbook-task-done-label">done</span><span class="cookbook-task-clear-label">clear</span></span></span>
        ${task.type === 'download' && !task.payload?._dep && task.status === 'done' ? `<span class="cookbook-task-status cookbook-task-done">finished</span>` : ''}
        <button class="cookbook-task-menu-btn" title="Actions">&#8942;</button>
      </div>
      <div class="cookbook-task-sub"><span class="cookbook-task-session">${esc(task.sessionId)}</span><span class="cookbook-task-uptime" style="display:${((task.type === 'serve' || task.type === 'download') && task.status === 'running') ? '' : 'none'}"></span></div>
@@ -1490,6 +1620,9 @@ export function _renderRunningTab() {
    const _waveEl = el.querySelector('.cookbook-task-wave');
    if (_waveEl && task.status === 'running') _registerWaveEl(_waveEl);
    const terminalDiag = _terminalServeDiagnosis(task, task.output || '');
    if (terminalDiag) _showDiagnosis(el, terminalDiag, task.output || '');
    const _uptimeEl = el.querySelector('.cookbook-task-uptime');
    if (_uptimeEl && (task.type === 'serve' || task.type === 'download') && task.status === 'running') {
      const _startedAt = task.ts || Date.now();
@@ -1506,35 +1639,12 @@ export function _renderRunningTab() {
    }
    // Re-open the Serve panel for this model, pre-filled with the EXACT
-    // settings this instance launched with, and on the SERVER it runs on —
+    // settings this instance launched with, and on the SERVER it runs on.
    // shared by the edit icon button and the ⋮ "Edit settings" menu item.
    const _openEdit = () => _openServeEditForTask(task);
-    const editBtn = el.querySelector('.cookbook-task-edit-btn');
+    el.addEventListener('cookbook:edit-serve', (e) => {
-    if (editBtn) {
+      e.stopPropagation();
-      editBtn.addEventListener('click', (e) => { e.stopPropagation(); _openEdit(); });
+      _openServeEditForTask(task, null, e.detail?.fields || null);
-    }
+    });
    // Wire save icon button
    const saveBtn = el.querySelector('.cookbook-task-save-btn');
    if (saveBtn) {
      saveBtn.addEventListener('click', async (e) => {
        e.stopPropagation();
        // Tell them it's already saved up front (often true now that working
        // configs auto-save) instead of after they've typed a name.
        if (_loadPresets().some(p => p.cmd === task.payload?._cmd)) {
          uiModule.showToast('Already saved');
          return;
        }
        const label = (await uiModule.styledPrompt('Name this config so you can recall it later.', {
          title: 'Save Config', defaultValue: task.name, placeholder: 'e.g. 8-bit, fast', confirmText: 'Save',
        }) || '').trim();
        if (!label) return;
        if (!_saveTaskAsPreset(task, label)) { uiModule.showToast('Already saved'); return; }
        saveBtn.innerHTML = '<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="2.5" stroke-linecap="round"><polyline points="20 6 9 17 4 12"/></svg>';
        uiModule.showToast(`Saved "${label}"`);
        setTimeout(() => { saveBtn.style.display = 'none'; }, 1500);
      });
    }
    // Finished download → an explicit "Serve →" button jumps straight to the
    // Serve tab with this model pre-selected (on the server it downloaded to).
@@ -2018,12 +2128,31 @@ async function _reconnectTask(el, task) {
          if (badge) { badge.textContent = _statusLabel('error', task.type); badge.className = 'cookbook-task-status cookbook-task-error'; }
          _showCookbookNotif(true);
        } else {
-          const looksSuccessful = !lastOutput.includes('DOWNLOAD_FAILED') && (lastOutput.includes('DONE') || lastOutput.includes('100%') || lastOutput.includes('Application startup complete') || lastOutput.includes('/snapshots/') || lastOutput.includes('Download complete') || lastOutput.includes('DOWNLOAD_OK'));
+          const downloadLooksSuccessful = !lastOutput.includes('DOWNLOAD_FAILED')
-          if (!lastOutput.trim() || (task.type === 'download' && !looksSuccessful)) {
+            && (lastOutput.includes('DONE') || lastOutput.includes('100%') || lastOutput.includes('/snapshots/') || lastOutput.includes('Download complete') || lastOutput.includes('DOWNLOAD_OK'));
          const serveLooksReady = task.type === 'serve' && _serveOutputLooksReady({ ...task, output: lastOutput });
          const looksSuccessful = task.type === 'download' ? downloadLooksSuccessful : serveLooksReady;
          if (!lastOutput.trim() || !looksSuccessful) {
            _updateTask(task.sessionId, { status: 'crashed' });
            el.dataset.status = 'crashed';
            const badge = el.querySelector('.cookbook-task-status');
            if (badge) { badge.textContent = _statusLabel('crashed', task.type); badge.className = 'cookbook-task-status cookbook-task-crashed'; }
            if (task.type === 'serve') {
              const diag = _diagnose(lastOutput) || {
                message: _serveTaskLooksAwqOnLocalBackend(task, lastOutput)
                  ? 'AWQ/GPTQ/FP8 cannot be served through llama.cpp/Ollama unified-memory mode.'
                  : /Native llama-server not found|building llama-server|llama\.cpp/i.test(lastOutput)
                  ? 'llama.cpp build stopped before the server became reachable.'
                  : 'Serve stopped before the model became reachable.',
                suggestion: _serveTaskLooksAwqOnLocalBackend(task, lastOutput)
                  ? 'Suggested action: use vLLM/SGLang on a compatible CUDA/ROCm GPU server, or download a GGUF version for llama.cpp/Ollama/unified-memory serving.'
                  : /Native llama-server not found|building llama-server|llama\.cpp/i.test(lastOutput)
                  ? 'Suggested action: copy the troubleshooting bundle, then edit serve settings. For the quickest local/CPU path, use Ollama or a prebuilt llama-server; source builds can take several minutes and fail if build dependencies are incomplete.'
                  : 'Suggested action: copy the troubleshooting bundle, then edit serve settings or relaunch with a CPU/backend fallback.',
                fixes: [{ label: 'Edit serve', action: (panel) => _openServeEditForTask(task) }],
              };
              _showDiagnosis(el, diag, lastOutput);
            }
            _showCookbookNotif(true);
          } else {
            _updateTask(task.sessionId, { status: 'done' });
@@ -41,6 +41,48 @@ const SERVE_STATE_KEY = 'cookbook-serve-state';
 let _cachedAllModels = [];
 /**
  * Heuristically decide whether a model is an AWQ/GPTQ/FP8 quantization,
  * based on its `quant` tag and on the text of its repo id, name and path.
  *
  * @param {object|null|undefined} model - Model record; `quant`, `repo_id`,
  *   `name` and `path` are read when present.
  * @param {string|null|undefined} repo - Repo id the model is served from.
  * @returns {boolean} True when the quant tag starts with AWQ/GPTQ, equals
  *   FP8, or when "awq"/"gptq"/"fp8" appears as a standalone token in the
  *   combined repo/name/path text.
  */
 function _repoLooksAwqLike(model, repo) {
  const q = String(model?.quant || '').toUpperCase();
  const n = `${repo || ''} ${model?.repo_id || ''} ${model?.name || ''} ${model?.path || ''}`.toLowerCase();
  if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') return true;
  // A token is delimited by anything that is not a lowercase letter or digit
  // (the text is already lowercased). `\b` is deliberately not used: it treats
  // "_" as a word character, so underscore-delimited names such as
  // "llama_3_awq" or "model_fp8" would go undetected.
  return /(?:^|[^a-z0-9])(?:awq|gptq|fp8)(?![a-z0-9])/.test(n);
 }
 /**
  * Heuristically decide whether a model is a GGUF model.
  *
  * @param {object|null|undefined} model - Model record; `is_gguf`, `quant`,
  *   `repo_id`, `name` and `path` are read when present.
  * @param {string|null|undefined} repo - Repo id the model is served from.
  * @returns {boolean} True when `is_gguf` is truthy, the quant tag is "GGUF"
  *   or starts with Q2–Q8 / IQ, or the repo/name/path text contains "gguf".
  */
 function _repoLooksGgufLike(model, repo) {
  if (model?.is_gguf) return true;

  const quantTag = String(model?.quant || '').toUpperCase();
  if (quantTag === 'GGUF' || /^Q[2-8]/.test(quantTag) || /^IQ/.test(quantTag)) {
    return true;
  }

  // Same space-joined, lowercased text the quant-agnostic name check runs on.
  const haystack = [repo, model?.repo_id, model?.name, model?.path]
    .map((part) => part || '')
    .join(' ')
    .toLowerCase();
  return haystack.includes('gguf');
 }
 /**
  * Build a blocking warning when the chosen serve backend cannot serve the
  * model's apparent format.
  *
  * @param {object|null|undefined} model - Model record passed to the format heuristics.
  * @param {string|null|undefined} repo - Repo id passed to the format heuristics.
  * @param {string} backend - Selected backend id ('llamacpp', 'ollama', 'vllm', 'sglang', ...).
  * @param {object} [fields={}] - Serve form values; only `unified_mem` is read.
  * @returns {{title: string, body: string}|null} Warning text, or null when the
  *   combination is not flagged.
  */
 function _serveBackendWarning(model, repo, backend, fields = {}) {
  const wantsGgufBackend = backend === 'llamacpp' || backend === 'ollama';
  const wantsSafetensorsBackend = backend === 'vllm' || backend === 'sglang';

  if (_repoLooksAwqLike(model, repo)) {
    // AWQ/GPTQ/FP8 checks run first, in this order of precedence.
    if (wantsGgufBackend) {
      return {
        title: 'AWQ needs vLLM or SGLang',
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors. llama.cpp and Ollama need GGUF files, so this backend cannot serve it. Choose vLLM/SGLang on a CUDA/ROCm GPU server, or download a GGUF version for llama.cpp/Ollama.',
      };
    }
    if (_isMetal() && wantsSafetensorsBackend) {
      return {
        title: 'AWQ is not a unified-memory path',
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors. AWQ is for vLLM/SGLang on CUDA/ROCm-style GPU servers, not local unified-memory llama.cpp/Ollama serving. For unified memory, download a GGUF model and use llama.cpp/Ollama.',
      };
    }
    if (fields.unified_mem) {
      return {
        title: 'AWQ is not a unified-memory path',
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors, but unified-memory local serving expects GGUF. Use vLLM/SGLang on a compatible GPU server, or download a GGUF version for llama.cpp/Ollama.',
      };
    }
  }

  if (_repoLooksGgufLike(model, repo) && wantsSafetensorsBackend) {
    return {
      title: 'GGUF needs llama.cpp or Ollama',
      body: 'This model looks like GGUF. vLLM/SGLang expect HuggingFace safetensors-style repos. Choose llama.cpp/Ollama for GGUF, or download a safetensors model for vLLM/SGLang.',
    };
  }

  return null;
 }
 /**
  * Own-property check that tolerates a missing object.
  *
  * @param {object|null|undefined} obj - Object to inspect; any falsy value is
  *   treated as an empty object.
  * @param {string|symbol} key - Property key to look for.
  * @returns {boolean} True only when `key` is an own (non-inherited) property.
  */
 function _hasOwn(obj, key) {
  const target = obj || {};
  return Object.prototype.hasOwnProperty.call(target, key);
 }
@@ -324,12 +366,6 @@ function _rerenderCachedModels() {
        c.style.alignItems = '';
      });
      // Capture grid height
      const _tb = list.closest('.admin-card')?.querySelector('.memory-toolbar');
      const _tbH = _tb ? _tb.offsetHeight : 0;
      list.style.minHeight = (list.offsetHeight + _tbH) + 'px';
      list.style.maxHeight = (list.offsetHeight + _tbH) + 'px';
      const shortName = repo.split('/').pop();
      const _es = _envState;
      // The venv set per-server in Settings (server.envPath). Used as the venv
@@ -350,8 +386,13 @@ function _rerenderCachedModels() {
        ? _byRepo[repo]
        : (_lastUsed || (_isLegacyFlat ? _allSs : {}));
      const detectedBackend = _detectBackend(m).backend;
-      const defaultBackend = detectedBackend;
+      const _allowedBackends = new Set(_isWindows()
-      const savedMatchesBackend = (ss.backend || 'vllm') === detectedBackend;
+        ? ['llamacpp']
        : (_isMetal() ? ['llamacpp', 'ollama'] : ['vllm', 'sglang', 'llamacpp', 'ollama', 'diffusers']));
      const defaultBackend = (ss._forceBackend && ss.backend && _allowedBackends.has(ss.backend))
        ? ss.backend
        : detectedBackend;
      const savedMatchesBackend = !!ss._forceBackend || (ss.backend || 'vllm') === detectedBackend;
      const sv = (k, def) => (ss[k] !== undefined && savedMatchesBackend) ? ss[k] : def;
      const defaultTp = defaultBackend === 'llamacpp' ? '1' : sv('tp', '1');
      const detectedGpuIds = _allGpuIds(_getGpuToggleTotal?.());
@@ -1200,7 +1241,16 @@ function _rerenderCachedModels() {
          if (el.type === 'checkbox') serveState[el.dataset.field] = el.checked;
          else serveState[el.dataset.field] = el.value;
        });
-        serveState.backend = (_detectBackend(m).backend) || serveState.backend || 'vllm';
+        serveState.backend = serveState.backend || (_detectBackend(m).backend) || 'vllm';
        const backendWarning = _serveBackendWarning(m, repo, serveState.backend, serveState);
        if (backendWarning) {
          await window.styledConfirm(backendWarning.body, {
            title: backendWarning.title,
            confirmText: 'Edit settings',
            cancelText: 'Close',
          });
          return;
        }
        // Save in the { _byRepo, _lastUsed } schema — no legacy flat keys at
        // the root so per-model state doesn't leak between models.
        try {
@@ -2253,8 +2253,9 @@ function _renderActivityEntry(entry) {
  const hue = _categoryHue(entry.taskName, entry.kind);
  // CSS vars feed the colored title + accent stripe.
  const styleVars = `--cat-hue:${hue};`;
  const _runningPlaceholder = /^(Starting…|Starting\.\.\.|_Running…_|_Running\.\.\._|_Queued\b)/i.test((entry.result || '').trim());
  const hasResult = !!(entry.result && entry.result.trim() && entry.status !== 'running' && entry.status !== 'queued');
-  const hasRunningProgress = !!(entry.result && entry.result.trim() && (entry.status === 'running' || entry.status === 'queued'));
+  const hasRunningProgress = !!(entry.result && entry.result.trim() && !_runningPlaceholder && (entry.status === 'running' || entry.status === 'queued'));
  // "Open in chat" only makes sense for runs whose result is a real assistant
  // message (Prompt / Research tasks). Action/event runs are just log lines
  // (e.g. "No recent emails", "Tidied N memories") — for those, replace the
@@ -2299,9 +2300,10 @@ function _renderActivityEntry(entry) {
  let rightHtml;
  if (_isRunning) {
    const isQueued = entry.status === 'queued';
    const label = isQueued ? 'Queued' : 'Running';
    // Initial elapsed for the first paint; the 1s interval below keeps it live.
    const startMs = entry.ts ? new Date(entry.ts).getTime() : Date.now();
    const stale = !isQueued && (Date.now() - startMs) > 30 * 60 * 1000;
    const label = isQueued ? 'Queued' : stale ? 'Still running' : 'Running';
    const elapsedInit = isQueued ? '' : `<span class="task-log-running-elapsed" data-since="${startMs}">${_fmtElapsed(Date.now() - startMs)}</span>`;
    const forceBtn = isQueued && entry.taskId ? `<button class="task-log-force-run" type="button" title="Start now in parallel, bypassing the queue" style="border:0;background:transparent;box-shadow:none;margin-left:5px;padding:0;width:12px;height:12px;display:inline-flex;align-items:center;justify-content:center;font-size:10px;line-height:1;color:inherit;opacity:.8;"><svg width="9" height="9" viewBox="0 0 24 24" fill="currentColor" style="display:block;"><polygon points="6 4 20 12 6 20 6 4"/></svg></button>` : '';
    const stopBtn = entry.taskId ? `<button class="task-log-stop" type="button" title="Stop this task"><svg width="9" height="9" viewBox="0 0 24 24" fill="currentColor"><rect x="6" y="6" width="12" height="12" rx="1"/></svg></button>` : '';
@@ -5363,19 +5363,20 @@ body.bg-pattern-sparkles {
    /* The compare overlay's header title ignores pointer events, so clicks on
       the text fall through to whatever is underneath it. */
    #compare-model-overlay .modal-header h4 {
      pointer-events: none;
    }
-    /* Compare modal sizes to content — the global .modal-content max-height
+    /* Compare model selector: keep manually-resized/tiny windows contained.
-       + .modal-body overflow combo makes BOTH the outer card and the inner
+       Picker dropdowns are appended to document.body, so the card itself can
-       body scrollable, so even when the content fits the viewport you get
+       clip and scroll without cropping the dropdown list. */
       a stray vertical scrollbar. Drop the cap and disable inner scroll
       here; if the viewport is genuinely tiny the modal still won't exceed
       it because it's centered and the parent .modal flex layout shrinks. */
    #compare-model-overlay .modal-content {
-      max-height: none;
+      display: flex;
-      overflow: visible;
+      flex-direction: column;
      max-height: min(720px, calc(100dvh - 48px));
      overflow: hidden;
      min-height: 180px;
    }
    #compare-model-overlay .modal-body {
-      overflow: visible;
+      overflow: auto;
-      flex: 0 0 auto;
+      flex: 1 1 auto;
      min-height: 0;
    }
    .vis-hint {
      font-size: 10px;
@@ -6955,6 +6956,8 @@ pre { background: var(--code-bg, var(--hl-bg, #282c34)) !important; }
    /* Row of compare-mode tabs: a flex row with 4px gaps that wraps onto extra
       lines when the tabs do not fit on one. */
    .compare-mode-tabs {
      display: flex;
      gap: 4px;
      flex-wrap: wrap;
      /* Lets the row shrink below its content width when it is itself a
         flex item (assumption: the parent is a flex container; confirm). */
      min-width: 0;
    }
    /* Type tabs match Mode toggles 1:1 (same flex column layout, same metrics) */
    .compare-mode-tab {
@@ -19015,7 +19018,7 @@ body.gallery-selecting .gallery-dl-btn,
  align-items: center;
  gap: 3px;
  position: relative;
-  top: 2px;
+  top: 0;
  cursor: pointer;
  padding: 1px 6px 1px 4px;
  border-radius: 9px;
@@ -19024,22 +19027,17 @@ body.gallery-selecting .gallery-dl-btn,
 }
 .cookbook-task-check svg { flex-shrink: 0; }
 .cookbook-task-check:hover { background: color-mix(in srgb, var(--red, #ff5555) 18%, transparent); }
-/* Shows "done" (green) normally; on hover the icon + label swap to a red ✕ /
+/* Terminal task clear pill. */
   "clear" to reveal it's a dismiss action. */
 .cookbook-task-done-label,
 .cookbook-task-clear-label {
  font-size: 9px;
  line-height: 1;
  text-transform: lowercase;
 }
-.cookbook-task-done-label { color: var(--green, #50fa7b); }
+.cookbook-task-done-label { color: var(--red, #ff5555); }
-.cookbook-task-clear-label { display: none; color: var(--red, #ff5555); }
+.cookbook-task-clear-label { display: none; }
-.cookbook-task-check:hover .cookbook-task-done-label { display: none; }
+.cookbook-task-check-ico { display: none; }
-.cookbook-task-check:hover .cookbook-task-clear-label { display: inline; }
+.cookbook-task-clear-ico { display: inline; }
 /* Default: show the green check. On hover: swap to a red ✕ to signal "clear". */
 .cookbook-task-clear-ico { display: none; }
 .cookbook-task-check:hover .cookbook-task-check-ico { display: none; }
 .cookbook-task-check:hover .cookbook-task-clear-ico { display: inline; }
 /* "Serve" button on a finished download — green pill matching the "running" /
   finished badge (it sits next to the green FINISHED chip + check). */
 .cookbook-task-serve-btn {
@@ -19583,17 +19581,136 @@ body.gallery-selecting .gallery-dl-btn,
  border: 1px solid color-mix(in srgb, var(--color-error) 30%, transparent);
  border-radius: 6px;
 }
 /* Header row of the diagnosis panel: vertically centered flex row with 7px
    gaps between its children. */
 .cookbook-diag-header {
  display: flex;
  align-items: center;
  gap: 7px;
  /* Nudge the row up 4px; the matching negative bottom margin pulls the
     content below it up by the same amount. */
  position: relative;
  top: -4px;
  margin-bottom: -4px;
 }
 /* Fold control: stripped of border, padding and background so it renders as
    bold 11px error-colored text rather than a boxed button. */
 .cookbook-diag-fold {
  display: inline-flex;
  align-items: center;
  gap: 5px;
  padding: 0;
  min-height: 0;
  border: 0;
  background: transparent;
  color: var(--color-error);
  font: inherit;
  font-size: 11px;
  font-weight: 700;
  cursor: pointer;
  /* In a flex row, an auto right margin absorbs the free space and pushes the
     following siblings to the far edge. */
  margin-right: auto;
 }
 /* Hover keeps the transparent background and error color; only the opacity
    drops slightly as feedback. */
 .cookbook-diag-fold:hover {
  background: transparent;
  color: var(--color-error);
  opacity: 0.85;
 }
 /* Chevron glyph: a fixed 10px-wide inline box, so the text beside it keeps
    its position whatever glyph is shown. */
 .cookbook-diag-chevron {
  display: inline-block;
  width: 10px;
  font-size: 10px;
 }
 /* Copy control: borderless, transparent 18x18 box in the muted foreground
    color, with its content centered on both axes. */
 .cookbook-diag-copy {
  border: 0;
  background: transparent;
  color: var(--fg-muted);
  padding: 0 2px;
  width: 18px;
  height: 18px;
  min-height: 18px;
  cursor: pointer;
  display: inline-flex;
  align-items: center;
  justify-content: center;
 }
 /* Hover only brightens the color; the background stays transparent. */
 .cookbook-diag-copy:hover {
  background: transparent;
  color: var(--fg);
 }
 /* `.copied` state: green, falling back to #50fa7b when --green is unset. */
 .cookbook-diag-copy.copied {
  color: var(--green, #50fa7b);
 }
 /* Block display removes the inline baseline gap under the SVG icon. */
 .cookbook-diag-copy svg {
  display: block;
 }
 /* Dismiss control: borderless, transparent 16x18 box in the muted foreground
    color, with its 13px glyph centered on both axes. */
 .cookbook-diag-dismiss {
  border: 0;
  background: transparent;
  color: var(--fg-muted);
  padding: 0;
  width: 16px;
  height: 18px;
  min-height: 18px;
  line-height: 16px;
  font-size: 13px;
  cursor: pointer;
  display: inline-flex;
  align-items: center;
  justify-content: center;
  /* Shift the glyph up 2px relative to its normal position. */
  position: relative;
  top: -2px;
 }
 /* Hover switches to the error color; the background stays transparent. */
 .cookbook-diag-dismiss:hover {
  background: transparent;
  color: var(--color-error);
 }
 /* Diagnosis body: 7px of space above it. */
 .cookbook-diag-body {
  margin-top: 7px;
 }
 /* Diagnosis message: semi-bold 12px text in the error color. */
 .cookbook-diag-message {
  font-size: 12px;
  font-weight: 600;
  color: var(--color-error);
  margin-bottom: 4px;
  margin-left: 2px;
  /* Explicitly selectable so the message can be copied. */
  user-select: text;
 }
 /* Suggestion text: 11px muted text with a 1.35 line height. */
 .cookbook-diag-suggestion {
  font-size: 11px;
  line-height: 1.35;
  color: var(--fg-muted);
  margin-bottom: 8px;
  margin-left: 2px;
  /* Explicitly selectable so the suggestion can be copied. */
  user-select: text;
 }
 /* Container for fix controls: a wrapping flex row with 6px gaps. */
 .cookbook-diag-fixes {
  display: flex;
  flex-wrap: wrap;
  gap: 6px;
 }
 /* Actions wrapper: relatively positioned so absolutely positioned
    descendants are placed relative to it. */
 .cookbook-diag-actions {
  position: relative;
  display: inline-flex;
 }
 /* Action trigger: compact 11px control on the panel background, outlined
    with the error color mixed at 40% into transparent. */
 .cookbook-diag-action-trigger {
  font-size: 11px;
  padding: 4px 10px;
  min-height: 24px;
  background: var(--panel);
  border: 1px solid color-mix(in srgb, var(--color-error) 40%, transparent);
  color: var(--fg);
 }
 /* Hover: full-strength error-colored border and a 12% error-color tint. */
 .cookbook-diag-action-trigger:hover {
  border-color: var(--color-error);
  background: color-mix(in srgb, var(--color-error) 12%, transparent);
 }
 /* Menu: absolutely positioned 4px below the bottom edge of its positioned
    ancestor, aligned to that ancestor's left edge, at least 180px wide. */
 .cookbook-diag-menu {
  position: absolute;
  left: 0;
  top: calc(100% + 4px);
  min-width: 180px;
  z-index: 80;
 }
 /* Menu items: full-width, left-aligned buttons whose labels never wrap. */
 .cookbook-diag-menu button {
  width: 100%;
  justify-content: flex-start;
  text-align: left;
  white-space: nowrap;
 }
 .cookbook-diag-btn {
  font-size: 11px;
  padding: 4px 10px;