mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
feat(cookbook): surface Docker hardware visibility warnings (#3658)
This commit is contained in:
@@ -611,6 +611,93 @@ def _cache_key(host: str, ssh_port: str, platform_name: str):
|
||||
)
|
||||
|
||||
|
||||
def _is_containerized():
    """Guess whether the local Odysseus process is running inside a container.

    This is a best-effort heuristic: any failure while reading the cgroup
    file is treated as "not containerized" rather than raised.
    """
    # While a remote host is being probed, the local runtime is not the
    # subject of the scan, so never report it as containerized.
    if _remote_host:
        return False

    # The presence of this marker file alone is accepted as proof.
    if os.path.exists("/.dockerenv"):
        return True

    try:
        with open("/proc/1/cgroup", encoding="utf-8", errors="replace") as cgroup_file:
            cgroup_text = cgroup_file.read().lower()
    except Exception:
        return False

    # Fall back to looking for well-known runtime names in PID 1's cgroups.
    for runtime_marker in ("docker", "containerd", "kubepods"):
        if runtime_marker in cgroup_text:
            return True
    return False
|
||||
|
||||
|
||||
def _hardware_visibility_warning(result):
|
||||
"""Return a non-blocking UX warning when detected hardware may only be container-visible."""
|
||||
if not isinstance(result, dict):
|
||||
return None
|
||||
|
||||
if result.get("manual_hardware"):
|
||||
return None
|
||||
|
||||
if not result.get("containerized"):
|
||||
return None
|
||||
|
||||
if result.get("gpu_error"):
|
||||
return None
|
||||
|
||||
if not result.get("has_gpu"):
|
||||
return {
|
||||
"code": "container_no_gpu_visible",
|
||||
"severity": "warning",
|
||||
"title": "No GPU visible inside Docker",
|
||||
"message": (
|
||||
"Cookbook is scanning hardware from inside the Odysseus container. "
|
||||
"If your host has a GPU, Docker may not be exposing it to the container, "
|
||||
"so model recommendations may be CPU-only or too conservative."
|
||||
),
|
||||
"actions": [
|
||||
"manual_hardware",
|
||||
"rescan",
|
||||
"copy_diagnostics",
|
||||
],
|
||||
}
|
||||
|
||||
total_ram = result.get("total_ram_gb") or 0
|
||||
if total_ram and total_ram <= 8:
|
||||
return {
|
||||
"code": "container_low_ram_visible",
|
||||
"severity": "info",
|
||||
"title": "Container-visible RAM may be lower than host RAM",
|
||||
"message": (
|
||||
"Cookbook is seeing the RAM available inside the container. "
|
||||
"If your host has more memory, validate host RAM separately or use Manual Hardware."
|
||||
),
|
||||
"actions": [
|
||||
"manual_hardware",
|
||||
"rescan",
|
||||
"copy_diagnostics",
|
||||
],
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _attach_probe_context(result, host=""):
|
||||
"""Attach probe-scope metadata and optional hardware visibility warning."""
|
||||
if not isinstance(result, dict) or result.get("error"):
|
||||
return result
|
||||
|
||||
is_remote = bool(host)
|
||||
containerized = False if is_remote else _is_containerized()
|
||||
|
||||
result["probe_scope"] = "remote" if is_remote else ("container" if containerized else "native")
|
||||
result["containerized"] = containerized
|
||||
|
||||
warning = _hardware_visibility_warning(result)
|
||||
if warning:
|
||||
result["hardware_visibility_warning"] = warning
|
||||
else:
|
||||
result.pop("hardware_visibility_warning", None)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def detect_system(host="", ssh_port="", platform="", fresh=False):
|
||||
"""Detect system hardware: RAM, CPU, GPU. Cached per host (hardware rarely
|
||||
changes, and probing a remote host over SSH is slow). Pass fresh=True to
|
||||
@@ -635,6 +722,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
|
||||
if _remote_platform == "windows" and _remote_host:
|
||||
result = _detect_windows()
|
||||
if result:
|
||||
result = _attach_probe_context(result, host=host)
|
||||
_remote_host = None
|
||||
_remote_platform = None
|
||||
_cache_by_host[cache_key] = (now, result)
|
||||
@@ -653,6 +741,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
|
||||
if not _remote_host and os.name == "nt":
|
||||
result = _detect_windows()
|
||||
if result:
|
||||
result = _attach_probe_context(result, host=host)
|
||||
_cache_by_host[cache_key] = (now, result)
|
||||
return result
|
||||
# PowerShell probe failed entirely — fall through to the generic path
|
||||
@@ -714,6 +803,7 @@ def detect_system(host="", ssh_port="", platform="", fresh=False):
|
||||
"gpu_error": _last_gpu_error,
|
||||
}
|
||||
|
||||
result = _attach_probe_context(result, host=host)
|
||||
_remote_host = None
|
||||
_remote_platform = None
|
||||
_cache_by_host[cache_key] = (now, result)
|
||||
|
||||
@@ -750,6 +750,80 @@ export async function _hwfitFetch(fresh = false) {
|
||||
}
|
||||
}
|
||||
|
||||
// Renders a non-blocking hardware visibility warning when Cookbook is using
|
||||
// container-visible hardware that may not match the user's actual host machine.
|
||||
function _renderHwVisibilityWarning(sys) {
  const hwRow = document.getElementById('hwfit-hw-row');
  if (!hwRow) return;

  const boxId = 'hwfit-hw-visibility-warning';
  let noticeBox = document.getElementById(boxId);

  // A manual hardware profile is an explicit user override, so a
  // container-detection notice from the underlying scan is not shown for it.
  const notice = sys?.manual_hardware ? null : sys?.hardware_visibility_warning;

  if (!notice) {
    // Nothing to show: clear any box left from a previous render.
    noticeBox?.remove();
    return;
  }

  if (!noticeBox) {
    noticeBox = document.createElement('div');
    noticeBox.id = boxId;
    noticeBox.className = 'hwfit-loading hwfit-hw-visibility-warning';
    hwRow.insertAdjacentElement('afterend', noticeBox);
  }

  const heading = esc(notice.title || 'Hardware visibility note');
  const bodyText = esc(notice.message || '');

  noticeBox.innerHTML = `
    <div class="hwfit-hw-visibility-warning-title">${heading}</div>
    <div class="hwfit-hw-visibility-warning-body">${bodyText}</div>
    <div class="hwfit-hw-visibility-warning-actions">
      <button type="button" class="hwfit-gpu-btn" data-hw-action="manual">Edit manual hardware</button>
      <button type="button" class="hwfit-gpu-btn" data-hw-action="rescan">Rescan</button>
      <button type="button" class="hwfit-gpu-btn" data-hw-action="copy">Copy diagnostics</button>
    </div>
  `;

  // One click handler per data-hw-action value. The buttons are recreated by
  // the innerHTML assignment above, so listeners are attached on every render.
  const handlers = {
    // Reveal the manual hardware panel and bring its toggle button into view.
    manual() {
      const manualPanel = document.getElementById('hwfit-manual-panel');
      if (manualPanel) manualPanel.classList.remove('hidden');
      const manualBtn = document.getElementById('hwfit-hw-manual-btn');
      manualBtn?.scrollIntoView?.({ behavior: 'smooth', block: 'center' });
    },

    // Reset the GPU toggle state, clear the cached result, and fetch fresh.
    rescan() {
      _resetGpuToggleState();
      _hwfitCache = null;
      _hwfitFetch(true);
    },

    // Keep diagnostics copy/paste friendly for GitHub issues and Docker support.
    copy() {
      const lines = [
        'Odysseus Cookbook hardware diagnostics',
        `probe_scope=${sys?.probe_scope || ''}`,
        `containerized=${sys?.containerized === true}`,
        `backend=${sys?.backend || ''}`,
        `has_gpu=${sys?.has_gpu === true}`,
        `gpu_name=${sys?.gpu_name || ''}`,
        `gpu_count=${sys?.gpu_count || 0}`,
        `gpu_vram_gb=${sys?.gpu_vram_gb || ''}`,
        `ram=${sys?.available_ram_gb || '?'} / ${sys?.total_ram_gb || '?'} GB`,
        `cpu_cores=${sys?.cpu_cores || ''}`,
        `cpu_name=${sys?.cpu_name || ''}`,
        '',
        'Useful checks:',
        'docker compose exec odysseus nvidia-smi -L',
        'docker compose exec odysseus cat /proc/meminfo | head',
        'docker compose exec odysseus python -c "from services.hwfit.hardware import detect_system; import json; print(json.dumps(detect_system(fresh=True), indent=2))"',
      ];
      _copyText(lines.join('\n'));
    },
  };

  for (const [action, handler] of Object.entries(handlers)) {
    noticeBox
      .querySelector(`[data-hw-action="${action}"]`)
      ?.addEventListener('click', handler);
  }
}
|
||||
|
||||
export function _hwfitRenderHw(el, sys) {
|
||||
if (!el || !sys) return;
|
||||
// Cache system info globally so other modules can read VRAM without refetching
|
||||
@@ -838,6 +912,7 @@ export function _hwfitRenderHw(el, sys) {
|
||||
+ chip('cores', cores)
|
||||
+ chip('backend', esc(sys.backend || ''))
|
||||
+ manualChip;
|
||||
_renderHwVisibilityWarning(sys);
|
||||
// Body click → toggle "off" (dimmed, still visible). Membership of
|
||||
// _dismissedHwChips is what the ranker reads, so both add+remove
|
||||
// here also flips the model list. The manual chip is excluded —
|
||||
|
||||
@@ -21246,6 +21246,26 @@ body.gallery-selecting .gallery-dl-btn,
|
||||
display: flex; align-items: center; justify-content: center;
|
||||
color: var(--fg-muted); padding: 16px 0; font-size: 12px;
|
||||
}
|
||||
/* Hardware visibility notice box: lays out its children (title, body,
   action buttons) as a left-aligned vertical stack with a small top gap. */
.hwfit-hw-visibility-warning {
  display: flex;
  flex-direction: column;
  align-items: flex-start;
  gap: 8px;
  text-align: left;
  margin-top: 8px;
}
/* Notice heading: semi-bold to stand out from the body copy. */
.hwfit-hw-visibility-warning-title {
  font-weight: 600;
}
/* Explanatory text: slightly dimmed, with relaxed line spacing. */
.hwfit-hw-visibility-warning-body {
  opacity: 0.78;
  line-height: 1.45;
}
/* Action buttons: a horizontal row that wraps onto new lines when narrow. */
.hwfit-hw-visibility-warning-actions {
  display: flex;
  gap: 8px;
  flex-wrap: wrap;
}
|
||||
.hwfit-row {
|
||||
display: flex; align-items: center; gap: 6px; padding: 5px 8px;
|
||||
border-radius: 6px; cursor: pointer; font-size: 11px;
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
"""Tests for Cookbook hardware probe context and container visibility warnings."""
|
||||
|
||||
import pytest
|
||||
|
||||
from services.hwfit import hardware
|
||||
|
||||
|
||||
@pytest.mark.area_services
@pytest.mark.area_unit
def test_container_no_gpu_gets_visibility_warning(monkeypatch):
    """A containerized local probe that sees no GPU must carry the no-GPU notice."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    # CPU-only probe result with no driver error reported.
    probe = dict(
        total_ram_gb=7.7,
        available_ram_gb=6.4,
        cpu_cores=12,
        cpu_name="Test CPU",
        has_gpu=False,
        gpu_name=None,
        gpu_vram_gb=None,
        gpu_count=0,
        backend="cpu_x86",
        gpu_error=None,
    )

    enriched = hardware._attach_probe_context(probe, host="")

    assert enriched["containerized"] is True
    assert enriched["probe_scope"] == "container"
    notice = enriched["hardware_visibility_warning"]
    assert notice["code"] == "container_no_gpu_visible"
    assert "manual_hardware" in notice["actions"]
|
||||
|
||||
|
||||
@pytest.mark.area_services
@pytest.mark.area_unit
def test_native_no_gpu_does_not_get_container_warning(monkeypatch):
    """A native local probe without a GPU is reported as-is, with no container notice."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: False)

    # CPU-only probe result with no driver error reported.
    probe = dict(
        total_ram_gb=16,
        available_ram_gb=10,
        cpu_cores=12,
        cpu_name="Test CPU",
        has_gpu=False,
        gpu_name=None,
        gpu_vram_gb=None,
        gpu_count=0,
        backend="cpu_x86",
        gpu_error=None,
    )

    enriched = hardware._attach_probe_context(probe, host="")

    assert enriched["containerized"] is False
    assert enriched["probe_scope"] == "native"
    assert "hardware_visibility_warning" not in enriched
|
||||
|
||||
|
||||
@pytest.mark.area_services
@pytest.mark.area_unit
def test_remote_probe_does_not_get_local_container_warning(monkeypatch):
    """Remote probes ignore the local container state, even when it reports True."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    # CPU-only result, as if returned from a remote machine.
    probe = dict(
        total_ram_gb=16,
        available_ram_gb=10,
        cpu_cores=12,
        cpu_name="Remote CPU",
        has_gpu=False,
        gpu_name=None,
        gpu_vram_gb=None,
        gpu_count=0,
        backend="cpu_x86",
        gpu_error=None,
    )

    enriched = hardware._attach_probe_context(probe, host="user@example.com")

    assert enriched["containerized"] is False
    assert enriched["probe_scope"] == "remote"
    assert "hardware_visibility_warning" not in enriched
|
||||
|
||||
|
||||
@pytest.mark.area_services
@pytest.mark.area_unit
def test_gpu_driver_error_does_not_show_container_no_gpu_warning(monkeypatch):
    """A reported GPU driver error takes precedence over the Docker no-GPU notice."""
    monkeypatch.setattr(hardware, "_is_containerized", lambda: True)

    # No GPU detected, but the probe recorded why: a driver-level failure.
    probe = dict(
        total_ram_gb=16,
        available_ram_gb=10,
        cpu_cores=12,
        cpu_name="Test CPU",
        has_gpu=False,
        gpu_name=None,
        gpu_vram_gb=None,
        gpu_count=0,
        backend="cpu_x86",
        gpu_error="NVIDIA driver/library version mismatch",
    )

    enriched = hardware._attach_probe_context(probe, host="")

    assert enriched["containerized"] is True
    assert enriched["probe_scope"] == "container"
    assert "hardware_visibility_warning" not in enriched
|
||||
Reference in New Issue
Block a user