odysseus/src/service_health.py

"""Consolidated service health / degraded-state reporting.

ROADMAP: "Better degraded-state reporting for ChromaDB, SearXNG, email, ntfy,
and provider probes." There was no single readout of which subsystems are
actually working — `/api/health` is only a liveness ping and each subsystem's
signal lives in a different module. This collects them into one uniform,
*non-intrusive* report (no test push is sent, no real search is run), so the
admin endpoint built on top of it is safe to poll.

Each probe returns:

    {"name": str, "status": "ok"|"degraded"|"down"|"disabled",
     "detail": str, "meta": dict}

- ok        — reachable / working
- degraded  — partially working (one of several components down)
- down      — configured & enabled but unreachable / erroring
- disabled  — not configured or turned off (not counted as a failure)

Design notes (driven by review feedback):

- **Bounded wall-clock.** Per-item probes (providers, email accounts) fan out
  across a bounded thread pool with a hard total budget (`_FANOUT_BUDGET`);
  stragglers are reported as a controlled `timeout` rather than blocking. The
  aggregate adds a per-subsystem deadline (`_SUBSYSTEM_DEADLINE`) and an overall
  ceiling (`_AGGREGATE_DEADLINE`), so the endpoint cannot hang regardless of how
  many endpoints/accounts are configured or how slowly they respond.
- **No secret leakage.** Even though the endpoint is admin-only, the response
  never returns credential-bearing URLs or raw exception text: URLs are passed
  through `_safe_url` (userinfo / query / fragment stripped) and failures are
  mapped to controlled categories via `_classify_error`.

The probe functions take their inputs as parameters (settings dict, account
list, endpoint list, manager objects) and isolate the network call to
``_http_get`` / injected callables, so they unit-test without touching the
network.
"""

import asyncio
import concurrent.futures
import logging
import socket
import ssl
import time
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlparse

logger = logging.getLogger(__name__)

# Status ordering for rolling up an overall verdict. "disabled" is excluded —
# a turned-off feature must never drag the overall status down.
_SEVERITY = {"ok": 0, "degraded": 1, "down": 2}

OK = "ok"
DEGRADED = "degraded"
DOWN = "down"
DISABLED = "disabled"

# Timing budgets (seconds). _PROBE_TIMEOUT bounds a single network op;
# _FANOUT_BUDGET bounds a whole fan-out (providers/email) regardless of count;
# the aggregate layer adds a per-subsystem deadline and an overall ceiling.
_PROBE_TIMEOUT = 4
_PROBE_CONCURRENCY = 8
_FANOUT_BUDGET = 8
_SUBSYSTEM_DEADLINE = 10
_AGGREGATE_DEADLINE = 14

# Controlled, secret-free phrasing for each failure category.
_ERROR_DETAIL = {
    "timeout": "probe timed out",
    "connection_refused": "connection refused",
    "dns_error": "host could not be resolved",
    "tls_error": "TLS handshake failed",
    "network_error": "network error",
    "http_error": "server returned an error response",
    "auth_or_protocol_error": "authentication or protocol error",
    "no_models": "endpoint returned no models",
    "no_host": "no host configured",
    "error": "probe failed",
}


def _svc(name: str, status: str, detail: str, **meta: Any) -> Dict[str, Any]:
    return {"name": name, "status": status, "detail": detail, "meta": dict(meta)}


def _safe_url(url: Optional[str]) -> str:
    """Strip credentials (userinfo), query, and fragment from a URL.

    Keeps scheme / host / port / path so the report is still useful, but never
    echoes `user:pass@`, `?api_key=…`, or `#…` back to the caller. Returns
    "<redacted>" if the URL can't be parsed into at least a host.
    """
    if not url:
        return ""
    raw = url.strip()
    try:
        p = urlparse(raw if "://" in raw else "//" + raw)
        host = p.hostname or ""
        if not host:
            return "<redacted>"
        netloc = f"{host}:{p.port}" if p.port else host
        path = (p.path or "").rstrip("/")
        scheme = f"{p.scheme}://" if p.scheme else ""
        return f"{scheme}{netloc}{path}"
    except Exception:
        return "<redacted>"


def _classify_error(exc: BaseException) -> str:
    """Map an exception to a controlled, secret-free category token.

    Never returns `str(exc)` — httpx/imaplib exception text can embed the target
    URL (which may carry credentials) or server-supplied detail.
    """
    if isinstance(exc, (asyncio.TimeoutError, concurrent.futures.TimeoutError,
                        TimeoutError, socket.timeout)):
        return "timeout"
    name = type(exc).__name__
    mod = (type(exc).__module__ or "")
    if isinstance(exc, ssl.SSLError) or "SSL" in name or "Certificate" in name:
        return "tls_error"
    if isinstance(exc, socket.gaierror) or name in ("gaierror", "herror"):
        return "dns_error"
    if isinstance(exc, ConnectionRefusedError) or "ConnectionRefused" in name \
            or name in ("ConnectError",):
        return "connection_refused"
    if "Timeout" in name:
        return "timeout"
    if mod.startswith("imaplib") or name in ("error", "abort", "readonly"):
        return "auth_or_protocol_error"
    if name == "HTTPStatusError":
        return "http_error"
    if name in ("ConnectTimeout", "ReadTimeout", "ReadError", "WriteError",
                "PoolTimeout", "RemoteProtocolError", "NetworkError",
                "ProxyError", "ProtocolError"):
        return "network_error"
    if isinstance(exc, OSError):
        return "network_error"
    return "error"


def _detail_for(category: str) -> str:
    return _ERROR_DETAIL.get(category, _ERROR_DETAIL["error"])


def _http_get(url: str, timeout: float = _PROBE_TIMEOUT):
    """Single network entry point for the HTTP probes (monkeypatched in tests)."""
    import httpx
    return httpx.get(url, timeout=timeout)


def _bounded_map(items: List[Any], worker: Callable[[int, Any], Dict[str, Any]],
                 *, budget: float = _FANOUT_BUDGET,
                 concurrency: int = _PROBE_CONCURRENCY) -> List[Optional[Dict[str, Any]]]:
    """Run ``worker(index, item)`` across a bounded thread pool, in order.

    `worker` must catch its own exceptions and return a per-item dict. Any item
    not finished within `budget` seconds *in total* is left as ``None`` (the
    caller substitutes a controlled `timeout` entry). The pool is shut down with
    ``wait=False`` so stragglers never block the response — their own per-op
    timeout reaps them shortly after.
    """
    n = len(items)
    out: List[Optional[Dict[str, Any]]] = [None] * n
    if n == 0:
        return out
    ex = concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(concurrency, n)))
    futures = {ex.submit(worker, i, items[i]): i for i in range(n)}
    try:
        for fut in concurrent.futures.as_completed(futures, timeout=budget):
            i = futures[fut]
            try:
                out[i] = fut.result()
            except Exception as e:  # worker is expected to handle its own errors
                out[i] = {"ok": False, "error": _classify_error(e)}
    except concurrent.futures.TimeoutError:
        pass  # unfinished items stay None → marked timeout by the caller
    finally:
        ex.shutdown(wait=False, cancel_futures=True)
    return out


# ── ChromaDB (vector RAG + vector memory) ──

def chromadb_health(rag_manager: Any, memory_vector: Any) -> Dict[str, Any]:
    """Report on the two ChromaDB-backed stores via their `.healthy` flags.

    Both absent  → disabled (Chroma/embeddings not installed or off).
    Both healthy → ok. One down → degraded. Both present but unhealthy → down.
    """
    rag_present = rag_manager is not None
    mem_present = memory_vector is not None
    if not rag_present and not mem_present:
        return _svc("chromadb", DISABLED,
                    "Vector RAG and vector memory are not initialized.",
                    rag=None, memory=None)

    rag_ok = bool(rag_present and getattr(rag_manager, "healthy", False))
    mem_ok = bool(mem_present and getattr(memory_vector, "healthy", False))
    meta = {"rag": rag_ok if rag_present else None,
            "memory": mem_ok if mem_present else None}

    healthy = [ok for ok in (rag_ok if rag_present else None,
                             mem_ok if mem_present else None) if ok is not None]
    if healthy and all(healthy):
        return _svc("chromadb", OK, "Vector stores healthy.", **meta)
    if any(healthy):
        return _svc("chromadb", DEGRADED,
                    "One vector store is unavailable.", **meta)
    return _svc("chromadb", DOWN, "Vector stores are unavailable.", **meta)


# ── SearXNG ──

def _searxng_instance(settings: Dict[str, Any]) -> str:
    """Mirror src/search/providers.py:_get_search_instance precedence."""
    url = (settings.get("search_url") or "").strip()
    if url:
        return url.rstrip("/")
    from src.constants import SEARXNG_INSTANCE
    return SEARXNG_INSTANCE.rstrip("/")


def searxng_health(settings: Dict[str, Any],
                   *, http_get: Callable = _http_get) -> Dict[str, Any]:
    """Non-intrusive reachability probe for the configured SearXNG instance.

    Tries `/healthz` (2xx), falling back to the instance root (any non-5xx means
    the host answered). No search query is run. The configured instance is
    probed in full, but only its sanitized form is returned in `meta`.
    """
    provider = (settings.get("search_provider") or "searxng")
    if provider != "searxng":
        return _svc("searxng", DISABLED,
                    f"Search provider is '{provider}', not SearXNG.",
                    provider=provider)
    instance = _searxng_instance(settings)
    if not instance:
        return _svc("searxng", DISABLED, "No SearXNG instance configured.")
    safe_instance = _safe_url(instance)
    last_category = "error"
    for path, accept in (("/healthz", lambda c: 200 <= c < 300),
                         ("/", lambda c: 0 < c < 500)):
        try:
            r = http_get(instance + path, timeout=_PROBE_TIMEOUT)
            code = getattr(r, "status_code", 0)
            if accept(code):
                return _svc("searxng", OK, f"Reachable (HTTP {code}).",
                            instance=safe_instance, probed=path, http_status=code)
            last_category = "http_error"
        except Exception as e:  # connection refused, DNS, timeout, …
            last_category = _classify_error(e)
    return _svc("searxng", DOWN, f"Unreachable ({_detail_for(last_category)}).",
                instance=safe_instance, error=last_category)


# ── ntfy ──

def _ntfy_integration(integrations: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """First enabled ntfy integration with a base_url (matches note_routes)."""
    for i in integrations or []:
        if (i.get("preset") == "ntfy" and i.get("enabled", True)
                and i.get("base_url")):
            return i
    return None


def ntfy_health(integrations: List[Dict[str, Any]], settings: Dict[str, Any],
                *, http_get: Callable = _http_get) -> Dict[str, Any]:
    """Non-intrusive ntfy probe via the server's built-in `/v1/health` route.

    No test notification is POSTed — `/v1/health` returns `{"healthy":true}`
    without publishing to a topic. The request keeps whatever credentials the
    configured base_url carries, but `meta.base` is sanitized.
    """
    channel = settings.get("reminder_channel") or "browser"
    intg = _ntfy_integration(integrations)
    if not intg:
        return _svc("ntfy", DISABLED, "No ntfy integration configured.",
                    reminder_channel=channel)
    raw = (intg.get("base_url") or "").strip()
    parsed = urlparse(raw)
    probe_base = (f"{parsed.scheme}://{parsed.netloc}"
                  if parsed.scheme and parsed.netloc else raw.rstrip("/"))
    safe_base = _safe_url(raw)
    try:
        r = http_get(probe_base + "/v1/health", timeout=_PROBE_TIMEOUT)
        code = getattr(r, "status_code", 0)
        if code and code < 500:
            return _svc("ntfy", OK, f"Reachable (HTTP {code}).",
                        base=safe_base, reminder_channel=channel, http_status=code)
        return _svc("ntfy", DOWN, "Server returned an error response.",
                    base=safe_base, reminder_channel=channel, error="http_error")
    except Exception as e:
        category = _classify_error(e)
        return _svc("ntfy", DOWN, f"Unreachable ({_detail_for(category)}).",
                    base=safe_base, reminder_channel=channel, error=category)


# ── Email (IMAP) ──

def email_health(accounts: List[Dict[str, Any]],
                 *, connect: Optional[Callable] = None) -> Dict[str, Any]:
    """Try a short IMAP connect+logout per configured account, concurrently.

    All connect → ok. Some fail → degraded. All fail → down. No account
    configured → disabled. Bounded by `_FANOUT_BUDGET` regardless of count.
    `meta` carries only the account label and a controlled error category —
    never credentials or raw exception text.
    """
    if not accounts:
        return _svc("email", DISABLED, "No email accounts configured.")
    if connect is None:
        from routes.email_helpers import _imap_connect
        # Impose the service-health budget on the IMAP connect itself.
        connect = lambda aid: _imap_connect(aid, timeout=_PROBE_TIMEOUT)  # noqa: E731

    def _label(acc: Dict[str, Any]) -> str:
        return acc.get("account_name") or acc.get("account_id") or "account"

    def _check(_i: int, acc: Dict[str, Any]) -> Dict[str, Any]:
        name = _label(acc)
        if not (acc.get("imap_host") or ""):
            return {"name": name, "ok": False, "error": "no_host"}
        try:
            conn = connect(acc.get("account_id"))
            try:
                conn.logout()
            except Exception:
                pass
            return {"name": name, "ok": True, "error": None}
        except Exception as e:
            return {"name": name, "ok": False, "error": _classify_error(e)}

    raw = _bounded_map(accounts, _check, budget=_FANOUT_BUDGET,
                       concurrency=_PROBE_CONCURRENCY)
    per_account = [r if r is not None
                   else {"name": _label(accounts[i]), "ok": False, "error": "timeout"}
                   for i, r in enumerate(raw)]
    return _rollup_items("email", "mailbox(es)", per_account)


# ── Provider endpoints ──

def providers_health(endpoints: List[Dict[str, Any]],
                     *, probe: Optional[Callable] = None) -> Dict[str, Any]:
    """Probe each enabled model endpoint's model list, concurrently.

    `endpoints` is a list of plain dicts ({name, base_url, api_key}) so this
    stays decoupled from the ORM and trivially testable. Non-empty model list
    → reachable. Bounded by `_FANOUT_BUDGET` regardless of count. `meta` never
    contains api_key or raw URLs — only a display name (or a sanitized URL when
    no name is set) and a controlled error category.
    """
    if not endpoints:
        return _svc("providers", DISABLED, "No model endpoints configured.")
    if probe is None:
        from routes.model_routes import _probe_endpoint as probe

    def _label(ep: Dict[str, Any]) -> str:
        return ep.get("name") or _safe_url(ep.get("base_url")) or "endpoint"

    def _check(_i: int, ep: Dict[str, Any]) -> Dict[str, Any]:
        name = _label(ep)
        try:
            models = probe(ep.get("base_url"), ep.get("api_key"),
                           timeout=_PROBE_TIMEOUT) or []
        except Exception as e:
            return {"name": name, "ok": False, "model_count": 0,
                    "error": _classify_error(e)}
        count = len(models)
        return {"name": name, "ok": bool(count), "model_count": count,
                "error": None if count else "no_models"}

    raw = _bounded_map(endpoints, _check, budget=_FANOUT_BUDGET,
                       concurrency=_PROBE_CONCURRENCY)
    per_endpoint = [r if r is not None
                    else {"name": _label(endpoints[i]), "ok": False,
                          "model_count": 0, "error": "timeout"}
                    for i, r in enumerate(raw)]
    return _rollup_items("providers", "endpoint(s)", per_endpoint, key="endpoints")


def _rollup_items(name: str, noun: str, items: List[Dict[str, Any]],
                  key: str = "accounts") -> Dict[str, Any]:
    """Shared ok/degraded/down rollup for a list of per-item probe results."""
    total = len(items)
    ok_count = sum(1 for it in items if it.get("ok"))
    if ok_count == total:
        status, detail = OK, f"{ok_count}/{total} {noun} reachable."
    elif ok_count == 0:
        status, detail = DOWN, f"No {noun} reachable."
    else:
        status, detail = DEGRADED, f"{ok_count}/{total} {noun} reachable."
    return _svc(name, status, detail, **{key: items})


# ── Aggregate ──

def _rollup(services: List[Dict[str, Any]]) -> str:
    worst = OK
    for s in services:
        sev = _SEVERITY.get(s.get("status"))
        if sev is not None and sev > _SEVERITY[worst]:
            worst = s["status"]
    return worst


def _gather_inputs() -> Dict[str, Any]:
    """Pull live config/account/endpoint lists from the app's data sources.

    Each lookup fails soft: a broken source yields an empty/neutral value so a
    single failure can't take down the whole health report.
    """
    settings: Dict[str, Any] = {}
    integrations: List[Dict[str, Any]] = []
    accounts: List[Dict[str, Any]] = []
    endpoints: List[Dict[str, Any]] = []
    try:
        from src.settings import load_settings
        settings = load_settings() or {}
    except Exception as e:
        logger.debug(f"service_health: settings load failed: {e}")
    try:
        from src.integrations import load_integrations
        integrations = load_integrations() or []
    except Exception as e:
        logger.debug(f"service_health: integrations load failed: {e}")
    try:
        from routes.email_helpers import _list_email_accounts
        accounts = _list_email_accounts() or []
    except Exception as e:
        logger.debug(f"service_health: email accounts load failed: {e}")
    try:
        from core.database import SessionLocal, ModelEndpoint
        db = SessionLocal()
        try:
            rows = db.query(ModelEndpoint).filter(
                ModelEndpoint.is_enabled == True).all()  # noqa: E712
            endpoints = [{"name": r.name, "base_url": r.base_url,
                          "api_key": r.api_key} for r in rows]
        finally:
            db.close()
    except Exception as e:
        logger.debug(f"service_health: endpoint load failed: {e}")
    return {"settings": settings, "integrations": integrations,
            "accounts": accounts, "endpoints": endpoints}


async def _run_subsystem(name: str, fn: Callable, *args: Any) -> Dict[str, Any]:
    """Run one (sync) subsystem probe in a thread under a hard deadline.

    A subsystem that overruns `_SUBSYSTEM_DEADLINE` (or raises) becomes a
    controlled `down`/`timeout` entry instead of hanging or leaking the error.
    """
    try:
        return await asyncio.wait_for(asyncio.to_thread(fn, *args),
                                      timeout=_SUBSYSTEM_DEADLINE)
    except asyncio.TimeoutError:
        return _svc(name, DOWN, _detail_for("timeout"), error="timeout")
    except Exception as e:
        category = _classify_error(e)
        return _svc(name, DOWN, _detail_for(category), error=category)


async def collect_service_health(rag_manager: Any = None,
                                 memory_vector: Any = None) -> Dict[str, Any]:
    """Run every probe and return {overall, services, timestamp}.

    Bounded end-to-end: in-process ChromaDB flags are read synchronously; the
    four network subsystems run concurrently, each under `_SUBSYSTEM_DEADLINE`,
    with an overall `_AGGREGATE_DEADLINE` backstop. Per-item probes inside
    providers/email are themselves bounded by `_FANOUT_BUDGET`.
    """
    from datetime import datetime, timezone

    inputs = _gather_inputs()
    settings = inputs["settings"]

    # ChromaDB is in-process and synchronous (just reads flags).
    chroma = chromadb_health(rag_manager, memory_vector)

    names = ["searxng", "ntfy", "email", "providers"]
    coros = [
        _run_subsystem("searxng", searxng_health, settings),
        _run_subsystem("ntfy", ntfy_health, inputs["integrations"], settings),
        _run_subsystem("email", email_health, inputs["accounts"]),
        _run_subsystem("providers", providers_health, inputs["endpoints"]),
    ]
    try:
        results = await asyncio.wait_for(asyncio.gather(*coros),
                                         timeout=_AGGREGATE_DEADLINE)
    except asyncio.TimeoutError:
        # Hard backstop — should not normally fire given per-subsystem deadlines.
        results = [_svc(n, DOWN, _detail_for("timeout"), error="timeout")
                   for n in names]

    services = [chroma, *results]
    return {
        "overall": _rollup(services),
        "services": services,
        # Timezone-aware UTC (…+00:00). Avoids the deprecated naive
        # datetime.utcnow() flagged in review (overlaps with #1116).
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }