# Total RAM (GB) at or below which a containerized probe gets the low-RAM note.
_CONTAINER_LOW_RAM_GB = 8

# UI actions offered with every hardware visibility warning.
_VISIBILITY_WARNING_ACTIONS = ("manual_hardware", "rescan", "copy_diagnostics")


def _is_containerized():
    """Best-effort check for whether the local Odysseus process is running in a container.

    Returns:
        bool: ``False`` whenever ``_remote_host`` is set (the probe targets
        another machine). Otherwise ``True`` if ``/.dockerenv`` exists or
        ``/proc/1/cgroup`` mentions ``docker``, ``containerd`` or
        ``kubepods``; ``False`` if neither signal is found or the cgroup
        file cannot be read.
    """
    if _remote_host:
        return False

    if os.path.exists("/.dockerenv"):
        return True

    try:
        with open("/proc/1/cgroup", encoding="utf-8", errors="replace") as f:
            text = f.read().lower()
    except OSError:
        # Missing or unreadable cgroup file: report "not containerized"
        # rather than failing the hardware probe.
        return False

    return any(marker in text for marker in ("docker", "containerd", "kubepods"))


def _hardware_visibility_warning(result):
    """Return a non-blocking UX warning when detected hardware may only be container-visible.

    Args:
        result: Hardware probe result. The keys read are ``manual_hardware``,
            ``containerized``, ``gpu_error``, ``has_gpu`` and ``total_ram_gb``.

    Returns:
        dict | None: A warning with ``code``, ``severity``, ``title``,
        ``message`` and ``actions`` keys, or ``None`` when no warning applies:
        ``result`` is not a dict, the user chose manual hardware, the probe
        is not containerized, a GPU error is already reported, or the
        visible hardware looks plausible. This function never raises on
        malformed values; a non-numeric ``total_ram_gb`` is treated as unknown.
    """
    if not isinstance(result, dict):
        return None

    if result.get("manual_hardware") or not result.get("containerized"):
        return None

    # A concrete GPU/driver error is more useful than a generic Docker hint.
    if result.get("gpu_error"):
        return None

    if not result.get("has_gpu"):
        return {
            "code": "container_no_gpu_visible",
            "severity": "warning",
            "title": "No GPU visible inside Docker",
            "message": (
                "Cookbook is scanning hardware from inside the Odysseus container. "
                "If your host has a GPU, Docker may not be exposing it to the container, "
                "so model recommendations may be CPU-only or too conservative."
            ),
            # Fresh list per call so callers can mutate it safely.
            "actions": list(_VISIBILITY_WARNING_ACTIONS),
        }

    total_ram = result.get("total_ram_gb")
    # bool is an int subclass but is never a meaningful RAM amount.
    ram_is_number = isinstance(total_ram, (int, float)) and not isinstance(total_ram, bool)
    if ram_is_number and 0 < total_ram <= _CONTAINER_LOW_RAM_GB:
        return {
            "code": "container_low_ram_visible",
            "severity": "info",
            "title": "Container-visible RAM may be lower than host RAM",
            "message": (
                "Cookbook is seeing the RAM available inside the container. "
                "If your host has more memory, validate host RAM separately or use Manual Hardware."
            ),
            "actions": list(_VISIBILITY_WARNING_ACTIONS),
        }

    return None


def _attach_probe_context(result, host=""):
    """Attach probe-scope metadata and optional hardware visibility warning.

    Args:
        result: Hardware probe result. Non-dict values and dicts carrying a
            truthy ``error`` are returned untouched.
        host: Remote host the probe ran against; empty for a local probe.

    Returns:
        The same ``result`` object, updated in place with ``probe_scope``
        (``"remote"``, ``"container"`` or ``"native"``) and ``containerized``.
        ``hardware_visibility_warning`` is set when a warning applies and
        removed otherwise, so a stale warning never survives a re-probe.
    """
    if not isinstance(result, dict) or result.get("error"):
        return result

    is_remote = bool(host)
    # Container detection only describes the local process, never a remote host.
    containerized = False if is_remote else _is_containerized()

    if is_remote:
        scope = "remote"
    elif containerized:
        scope = "container"
    else:
        scope = "native"

    result["probe_scope"] = scope
    result["containerized"] = containerized

    warning = _hardware_visibility_warning(result)
    if warning:
        result["hardware_visibility_warning"] = warning
    else:
        result.pop("hardware_visibility_warning", None)

    return result
Pass fresh=True to @@ -635,6 +722,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False): if _remote_platform == "windows" and _remote_host: result = _detect_windows() if result: + result = _attach_probe_context(result, host=host) _remote_host = None _remote_platform = None _cache_by_host[cache_key] = (now, result) @@ -653,6 +741,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False): if not _remote_host and os.name == "nt": result = _detect_windows() if result: + result = _attach_probe_context(result, host=host) _cache_by_host[cache_key] = (now, result) return result # PowerShell probe failed entirely — fall through to the generic path @@ -714,6 +803,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False): "gpu_error": _last_gpu_error, } + result = _attach_probe_context(result, host=host) _remote_host = None _remote_platform = None _cache_by_host[cache_key] = (now, result) diff --git a/static/js/cookbook-hwfit.js b/static/js/cookbook-hwfit.js index 29feb9279..33e695904 100644 --- a/static/js/cookbook-hwfit.js +++ b/static/js/cookbook-hwfit.js @@ -750,6 +750,80 @@ export async function _hwfitFetch(fresh = false) { } } +// Renders a non-blocking hardware visibility warning when Cookbook is using +// container-visible hardware that may not match the user's actual host machine. +function _renderHwVisibilityWarning(sys) { + const row = document.getElementById('hwfit-hw-row'); + if (!row) return; + + let box = document.getElementById('hwfit-hw-visibility-warning'); + + // Manual hardware is an explicit user override, so avoid showing stale + // container-detection warnings once the user has chosen a simulated profile. + const warning = sys?.manual_hardware ? 
null : sys?.hardware_visibility_warning; + + if (!warning) { + if (box) box.remove(); + return; + } + + if (!box) { + box = document.createElement('div'); + box.id = 'hwfit-hw-visibility-warning'; + box.className = 'hwfit-loading hwfit-hw-visibility-warning'; + row.insertAdjacentElement('afterend', box); + } + + box.innerHTML = ` +
${esc(warning.title || 'Hardware visibility note')}
+
${esc(warning.message || '')}
+
+ + + +
+ `; + + box.querySelector('[data-hw-action="manual"]')?.addEventListener('click', () => { + const panel = document.getElementById('hwfit-manual-panel'); + if (panel) panel.classList.remove('hidden'); + document.getElementById('hwfit-hw-manual-btn')?.scrollIntoView?.({ + behavior: 'smooth', + block: 'center', + }); + }); + + box.querySelector('[data-hw-action="rescan"]')?.addEventListener('click', () => { + _resetGpuToggleState(); + _hwfitCache = null; + _hwfitFetch(true); + }); + + box.querySelector('[data-hw-action="copy"]')?.addEventListener('click', () => { + // Keep diagnostics copy/paste friendly for GitHub issues and Docker support. + const text = [ + 'Odysseus Cookbook hardware diagnostics', + `probe_scope=${sys?.probe_scope || ''}`, + `containerized=${sys?.containerized === true}`, + `backend=${sys?.backend || ''}`, + `has_gpu=${sys?.has_gpu === true}`, + `gpu_name=${sys?.gpu_name || ''}`, + `gpu_count=${sys?.gpu_count || 0}`, + `gpu_vram_gb=${sys?.gpu_vram_gb || ''}`, + `ram=${sys?.available_ram_gb || '?'} / ${sys?.total_ram_gb || '?'} GB`, + `cpu_cores=${sys?.cpu_cores || ''}`, + `cpu_name=${sys?.cpu_name || ''}`, + '', + 'Useful checks:', + 'docker compose exec odysseus nvidia-smi -L', + 'docker compose exec odysseus cat /proc/meminfo | head', + 'docker compose exec odysseus python -c "from services.hwfit.hardware import detect_system; import json; print(json.dumps(detect_system(fresh=True), indent=2))"', + ].join('\n'); + + _copyText(text); + }); +} + export function _hwfitRenderHw(el, sys) { if (!el || !sys) return; // Cache system info globally so other modules can read VRAM without refetching @@ -838,6 +912,7 @@ export function _hwfitRenderHw(el, sys) { + chip('cores', cores) + chip('backend', esc(sys.backend || '')) + manualChip; + _renderHwVisibilityWarning(sys); // Body click → toggle "off" (dimmed, still visible). Membership of // _dismissedHwChips is what the ranker reads, so both add+remove // here also flips the model list. 
"""Tests for Cookbook hardware probe context and container visibility warnings."""

import pytest

from services.hwfit import hardware


def _probe_result(**overrides):
    """Return a CPU-only probe result dict, updated with *overrides*.

    The defaults describe a 16 GB machine with no GPU and no GPU error, so
    each test only spells out the fields it actually cares about.
    """
    result = {
        "total_ram_gb": 16,
        "available_ram_gb": 10,
        "cpu_cores": 12,
        "cpu_name": "Test CPU",
        "has_gpu": False,
        "gpu_name": None,
        "gpu_vram_gb": None,
        "gpu_count": 0,
        "backend": "cpu_x86",
        "gpu_error": None,
    }
    result.update(overrides)
    return result


@pytest.mark.area_services
@pytest.mark.area_unit
def test_container_no_gpu_gets_visibility_warning(monkeypatch):
    """Warn when a containerized local probe cannot see a GPU."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    result = _probe_result(total_ram_gb=7.7, available_ram_gb=6.4)

    out = hardware._attach_probe_context(result, host="")

    assert out["containerized"] is True
    assert out["probe_scope"] == "container"
    assert out["hardware_visibility_warning"]["code"] == "container_no_gpu_visible"
    assert "manual_hardware" in out["hardware_visibility_warning"]["actions"]


@pytest.mark.area_services
@pytest.mark.area_unit
def test_container_low_ram_with_gpu_gets_info_warning(monkeypatch):
    """Show the low-RAM note when a GPU is visible but container RAM is small."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    result = _probe_result(
        total_ram_gb=7.7,
        available_ram_gb=6.4,
        has_gpu=True,
        gpu_name="Test GPU",
        gpu_vram_gb=8,
        gpu_count=1,
        backend="cuda",
    )

    out = hardware._attach_probe_context(result, host="")

    assert out["probe_scope"] == "container"
    assert out["hardware_visibility_warning"]["code"] == "container_low_ram_visible"
    assert out["hardware_visibility_warning"]["severity"] == "info"


@pytest.mark.area_services
@pytest.mark.area_unit
def test_container_with_gpu_and_ample_ram_gets_no_warning(monkeypatch):
    """Do not warn when the container sees a GPU and plenty of RAM."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    result = _probe_result(
        has_gpu=True,
        gpu_name="Test GPU",
        gpu_vram_gb=8,
        gpu_count=1,
        backend="cuda",
    )

    out = hardware._attach_probe_context(result, host="")

    assert out["containerized"] is True
    assert out["probe_scope"] == "container"
    assert "hardware_visibility_warning" not in out


@pytest.mark.area_services
@pytest.mark.area_unit
def test_native_no_gpu_does_not_get_container_warning(monkeypatch):
    """Do not warn for a native local probe that genuinely has no GPU."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: False)

    out = hardware._attach_probe_context(_probe_result(), host="")

    assert out["containerized"] is False
    assert out["probe_scope"] == "native"
    assert "hardware_visibility_warning" not in out


@pytest.mark.area_services
@pytest.mark.area_unit
def test_remote_probe_does_not_get_local_container_warning(monkeypatch):
    """Do not apply local container warnings to remote hardware probes."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    result = _probe_result(cpu_name="Remote CPU")

    out = hardware._attach_probe_context(result, host="user@example.com")

    assert out["containerized"] is False
    assert out["probe_scope"] == "remote"
    assert "hardware_visibility_warning" not in out


@pytest.mark.area_services
@pytest.mark.area_unit
def test_gpu_driver_error_does_not_show_container_no_gpu_warning(monkeypatch):
    """Preserve GPU driver errors instead of replacing them with Docker warnings."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    result = _probe_result(gpu_error="NVIDIA driver/library version mismatch")

    out = hardware._attach_probe_context(result, host="")

    assert out["containerized"] is True
    assert out["probe_scope"] == "container"
    assert "hardware_visibility_warning" not in out


@pytest.mark.area_services
@pytest.mark.area_unit
def test_manual_hardware_suppresses_container_warning(monkeypatch):
    """Do not warn once the user has explicitly chosen manual hardware."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    result = _probe_result(manual_hardware=True)

    out = hardware._attach_probe_context(result, host="")

    assert out["containerized"] is True
    assert "hardware_visibility_warning" not in out


@pytest.mark.area_services
@pytest.mark.area_unit
def test_stale_warning_is_removed_when_no_longer_applicable(monkeypatch):
    """Drop a previously attached warning when the new probe context is clean."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: False)

    result = _probe_result(hardware_visibility_warning={"code": "container_no_gpu_visible"})

    out = hardware._attach_probe_context(result, host="")

    assert out["probe_scope"] == "native"
    assert "hardware_visibility_warning" not in out


@pytest.mark.area_services
@pytest.mark.area_unit
def test_error_result_is_returned_unchanged():
    """Leave failed probe results untouched so the error is reported as-is."""
    result = {"error": "ssh connection failed"}

    out = hardware._attach_probe_context(result, host="")

    assert out is result
    assert out == {"error": "ssh connection failed"}