feat(cookbook): surface Docker hardware visibility warnings (#3658)

This commit is contained in:
Karthik Rajesh
2026-06-15 07:51:04 +01:00
committed by GitHub
parent 2cf8bd14ae
commit 674457384a
4 changed files with 295 additions and 0 deletions
+90
View File
@@ -611,6 +611,93 @@ def _cache_key(host: str, ssh_port: str, platform_name: str):
)
def _is_containerized():
    """Guess whether the local Odysseus process is running inside a container.

    Returns False immediately while a remote host is being probed (the
    module-level ``_remote_host`` is set), because the answer only describes
    the local process. Otherwise the presence of ``/.dockerenv`` is treated as
    proof, and failing that ``/proc/1/cgroup`` is scanned for well-known
    container runtime names. Any failure while reading the cgroup file is
    treated as "not containerized" — this is a best-effort heuristic.
    """
    if _remote_host:
        return False
    if os.path.exists("/.dockerenv"):
        return True

    runtime_markers = ("docker", "containerd", "kubepods")
    try:
        with open("/proc/1/cgroup", encoding="utf-8", errors="replace") as cgroup_file:
            cgroup_text = cgroup_file.read().lower()
    except Exception:
        # Missing or unreadable /proc (non-Linux, restricted sandbox, ...).
        return False

    for marker in runtime_markers:
        if marker in cgroup_text:
            return True
    return False
def _hardware_visibility_warning(result):
"""Return a non-blocking UX warning when detected hardware may only be container-visible."""
if not isinstance(result, dict):
return None
if result.get("manual_hardware"):
return None
if not result.get("containerized"):
return None
if result.get("gpu_error"):
return None
if not result.get("has_gpu"):
return {
"code": "container_no_gpu_visible",
"severity": "warning",
"title": "No GPU visible inside Docker",
"message": (
"Cookbook is scanning hardware from inside the Odysseus container. "
"If your host has a GPU, Docker may not be exposing it to the container, "
"so model recommendations may be CPU-only or too conservative."
),
"actions": [
"manual_hardware",
"rescan",
"copy_diagnostics",
],
}
total_ram = result.get("total_ram_gb") or 0
if total_ram and total_ram <= 8:
return {
"code": "container_low_ram_visible",
"severity": "info",
"title": "Container-visible RAM may be lower than host RAM",
"message": (
"Cookbook is seeing the RAM available inside the container. "
"If your host has more memory, validate host RAM separately or use Manual Hardware."
),
"actions": [
"manual_hardware",
"rescan",
"copy_diagnostics",
],
}
return None
def _attach_probe_context(result, host=""):
"""Attach probe-scope metadata and optional hardware visibility warning."""
if not isinstance(result, dict) or result.get("error"):
return result
is_remote = bool(host)
containerized = False if is_remote else _is_containerized()
result["probe_scope"] = "remote" if is_remote else ("container" if containerized else "native")
result["containerized"] = containerized
warning = _hardware_visibility_warning(result)
if warning:
result["hardware_visibility_warning"] = warning
else:
result.pop("hardware_visibility_warning", None)
return result
def detect_system(host="", ssh_port="", platform="", fresh=False):
"""Detect system hardware: RAM, CPU, GPU. Cached per host (hardware rarely
changes, and probing a remote host over SSH is slow). Pass fresh=True to
@@ -635,6 +722,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
if _remote_platform == "windows" and _remote_host:
result = _detect_windows()
if result:
result = _attach_probe_context(result, host=host)
_remote_host = None
_remote_platform = None
_cache_by_host[cache_key] = (now, result)
@@ -653,6 +741,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
if not _remote_host and os.name == "nt":
result = _detect_windows()
if result:
result = _attach_probe_context(result, host=host)
_cache_by_host[cache_key] = (now, result)
return result
# PowerShell probe failed entirely — fall through to the generic path
@@ -714,6 +803,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
"gpu_error": _last_gpu_error,
}
result = _attach_probe_context(result, host=host)
_remote_host = None
_remote_platform = None
_cache_by_host[cache_key] = (now, result)
+75
View File
@@ -750,6 +750,80 @@ export async function _hwfitFetch(fresh = false) {
}
}
// Shows, refreshes or removes the non-blocking notice placed directly after
// the hardware row. The notice tells the user that Cookbook is working from
// container-visible hardware, which may differ from the real host machine.
function _renderHwVisibilityWarning(sys) {
    const hwRow = document.getElementById('hwfit-hw-row');
    if (!hwRow) return;

    const BOX_ID = 'hwfit-hw-visibility-warning';
    let noticeBox = document.getElementById(BOX_ID);

    // A manual hardware profile is an explicit user override, so any
    // container-detection notice attached to the payload is ignored.
    const notice = sys?.manual_hardware ? null : sys?.hardware_visibility_warning;
    if (!notice) {
        noticeBox?.remove();
        return;
    }

    if (!noticeBox) {
        noticeBox = document.createElement('div');
        noticeBox.id = BOX_ID;
        noticeBox.className = 'hwfit-loading hwfit-hw-visibility-warning';
        hwRow.insertAdjacentElement('afterend', noticeBox);
    }

    // Replacing innerHTML discards the previous buttons, so the listeners
    // registered below never pile up across re-renders.
    noticeBox.innerHTML = `
<div class="hwfit-hw-visibility-warning-title">${esc(notice.title || 'Hardware visibility note')}</div>
<div class="hwfit-hw-visibility-warning-body">${esc(notice.message || '')}</div>
<div class="hwfit-hw-visibility-warning-actions">
<button type="button" class="hwfit-gpu-btn" data-hw-action="manual">Edit manual hardware</button>
<button type="button" class="hwfit-gpu-btn" data-hw-action="rescan">Rescan</button>
<button type="button" class="hwfit-gpu-btn" data-hw-action="copy">Copy diagnostics</button>
</div>
`;

    const onAction = (action, handler) => {
        noticeBox
            .querySelector(`[data-hw-action="${action}"]`)
            ?.addEventListener('click', handler);
    };

    // Plain-text report meant to be pasted into GitHub issues or Docker
    // support threads; built at click time from the rendered payload.
    const buildDiagnostics = () => {
        const lines = [
            'Odysseus Cookbook hardware diagnostics',
            `probe_scope=${sys?.probe_scope || ''}`,
            `containerized=${sys?.containerized === true}`,
            `backend=${sys?.backend || ''}`,
            `has_gpu=${sys?.has_gpu === true}`,
            `gpu_name=${sys?.gpu_name || ''}`,
            `gpu_count=${sys?.gpu_count || 0}`,
            `gpu_vram_gb=${sys?.gpu_vram_gb || ''}`,
            `ram=${sys?.available_ram_gb || '?'} / ${sys?.total_ram_gb || '?'} GB`,
            `cpu_cores=${sys?.cpu_cores || ''}`,
            `cpu_name=${sys?.cpu_name || ''}`,
            '',
            'Useful checks:',
            'docker compose exec odysseus nvidia-smi -L',
            'docker compose exec odysseus cat /proc/meminfo | head',
            'docker compose exec odysseus python -c "from services.hwfit.hardware import detect_system; import json; print(json.dumps(detect_system(fresh=True), indent=2))"',
        ];
        return lines.join('\n');
    };

    onAction('manual', () => {
        // Reveal the manual hardware panel and bring its toggle into view.
        document.getElementById('hwfit-manual-panel')?.classList.remove('hidden');
        const manualBtn = document.getElementById('hwfit-hw-manual-btn');
        manualBtn?.scrollIntoView?.({ behavior: 'smooth', block: 'center' });
    });

    onAction('rescan', () => {
        // Drop cached state first so the fetch below performs a fresh probe.
        _resetGpuToggleState();
        _hwfitCache = null;
        _hwfitFetch(true);
    });

    onAction('copy', () => {
        _copyText(buildDiagnostics());
    });
}
export function _hwfitRenderHw(el, sys) {
if (!el || !sys) return;
// Cache system info globally so other modules can read VRAM without refetching
@@ -838,6 +912,7 @@ export function _hwfitRenderHw(el, sys) {
+ chip('cores', cores)
+ chip('backend', esc(sys.backend || ''))
+ manualChip;
_renderHwVisibilityWarning(sys);
// Body click → toggle "off" (dimmed, still visible). Membership of
// _dismissedHwChips is what the ranker reads, so both add+remove
// here also flips the model list. The manual chip is excluded —
+20
View File
@@ -21246,6 +21246,26 @@ body.gallery-selecting .gallery-dl-btn,
display: flex; align-items: center; justify-content: center;
color: var(--fg-muted); padding: 16px 0; font-size: 12px;
}
/* Hardware visibility notice container. The renderer sets this class together
   with `hwfit-loading` on the same element; the declarations here stack the
   title, body and action row in a left-aligned column. */
.hwfit-hw-visibility-warning {
    display: flex;
    flex-direction: column;
    align-items: flex-start;
    gap: 8px;
    text-align: left;
    margin-top: 8px;
}
/* Notice heading (the warning's title text). */
.hwfit-hw-visibility-warning-title {
    font-weight: 600;
}
/* Explanatory message, slightly faded relative to the title. */
.hwfit-hw-visibility-warning-body {
    opacity: 0.78;
    line-height: 1.45;
}
/* Row of action buttons; wraps onto further lines when space runs out. */
.hwfit-hw-visibility-warning-actions {
    display: flex;
    gap: 8px;
    flex-wrap: wrap;
}
.hwfit-row {
display: flex; align-items: center; gap: 6px; padding: 5px 8px;
border-radius: 6px; cursor: pointer; font-size: 11px;
@@ -0,0 +1,110 @@
"""Tests for Cookbook hardware probe context and container visibility warnings."""
import pytest
from services.hwfit import hardware
@pytest.mark.area_services
@pytest.mark.area_unit
def test_container_no_gpu_gets_visibility_warning(monkeypatch):
    """A containerized local probe that sees no GPU is flagged with the no-GPU notice."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)
    probe = dict(
        total_ram_gb=7.7,
        available_ram_gb=6.4,
        cpu_cores=12,
        cpu_name="Test CPU",
        has_gpu=False,
        gpu_name=None,
        gpu_vram_gb=None,
        gpu_count=0,
        backend="cpu_x86",
        gpu_error=None,
    )

    annotated = hardware._attach_probe_context(probe, host="")

    assert annotated["containerized"] is True
    assert annotated["probe_scope"] == "container"
    notice = annotated["hardware_visibility_warning"]
    assert notice["code"] == "container_no_gpu_visible"
    assert "manual_hardware" in notice["actions"]
@pytest.mark.area_services
@pytest.mark.area_unit
def test_native_no_gpu_does_not_get_container_warning(monkeypatch):
    """A native (non-container) local probe without a GPU stays free of the notice."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: False)
    probe = dict(
        total_ram_gb=16,
        available_ram_gb=10,
        cpu_cores=12,
        cpu_name="Test CPU",
        has_gpu=False,
        gpu_name=None,
        gpu_vram_gb=None,
        gpu_count=0,
        backend="cpu_x86",
        gpu_error=None,
    )

    annotated = hardware._attach_probe_context(probe, host="")

    assert annotated["containerized"] is False
    assert annotated["probe_scope"] == "native"
    assert "hardware_visibility_warning" not in annotated
@pytest.mark.area_services
@pytest.mark.area_unit
def test_remote_probe_does_not_get_local_container_warning(monkeypatch):
    """Remote probes are never marked containerized, even if the local process is."""
    # Patched to True so the test fails if the local check leaks into remote results.
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)
    probe = dict(
        total_ram_gb=16,
        available_ram_gb=10,
        cpu_cores=12,
        cpu_name="Remote CPU",
        has_gpu=False,
        gpu_name=None,
        gpu_vram_gb=None,
        gpu_count=0,
        backend="cpu_x86",
        gpu_error=None,
    )

    annotated = hardware._attach_probe_context(probe, host="user@example.com")

    assert annotated["containerized"] is False
    assert annotated["probe_scope"] == "remote"
    assert "hardware_visibility_warning" not in annotated
@pytest.mark.area_services
@pytest.mark.area_unit
def test_gpu_driver_error_does_not_show_container_no_gpu_warning(monkeypatch):
    """An existing GPU driver error is not replaced by the Docker visibility notice."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)
    probe = dict(
        total_ram_gb=16,
        available_ram_gb=10,
        cpu_cores=12,
        cpu_name="Test CPU",
        has_gpu=False,
        gpu_name=None,
        gpu_vram_gb=None,
        gpu_count=0,
        backend="cpu_x86",
        gpu_error="NVIDIA driver/library version mismatch",
    )

    annotated = hardware._attach_probe_context(probe, host="")

    assert annotated["containerized"] is True
    assert annotated["probe_scope"] == "container"
    assert "hardware_visibility_warning" not in annotated