fix(security): redact credential-bearing URLs and PII from logs (#4750)

* fix(security): redact credential-bearing URLs and PII from logs Several log statements emitted sensitive data in clear text: - model_routes / chat_routes / contacts_routes logged endpoint URLs raw. Admin-configured URLs can embed credentials in userinfo or query (e.g. https://user:pass@host, ?api_key=...). Route them through a shared core.log_safety.redact_url() that drops userinfo/query/fragment. - note_routes / task_scheduler logged operator email addresses (smtp_user, recipient). Replaced with presence booleans, which keeps the diagnostic ("why didn't this send") without writing PII to logs. model_routes already had a local redactor on its HTTPStatusError branch; the generic except branch was missed, so reuse the existing helper there. Clears CodeQL py/clear-text-logging-sensitive-data alerts 264, 317, 324, 325, 343, 344, 528. * fix(security): re-bracket IPv6 hosts and single-source the URL redactor Address review on #4750: - redact_url now re-brackets IPv6 literals so host:port stays unambiguous (https://[2001:db8::1]:8443/v1, not the bracket-less ambiguous form). - point model_routes._redact_url_for_log at the shared helper so the two redactors are single-sourced (also picks up the IPv6 fix).
2026-06-22 20:55:29 -04:00 · 2026-06-22 14:12:39 -07:00
parent 2f246c7779
commit 7e5db9a3c6
7 changed files with 73 additions and 22 deletions
@@ -0,0 +1,27 @@
+"""Helpers for keeping sensitive data out of logs.
+
+Endpoint URLs configured by admins can embed credentials in the userinfo
+(``https://user:pass@host``) or query string (``?api_key=...``). Logging them
+raw leaks those secrets, so route/diagnostic logs run URLs through
+``redact_url`` first. Reconstructing the URL without userinfo/query/fragment
+also doubles as a sanitizer barrier for CodeQL's clear-text-logging query.
+"""
+
+from urllib.parse import urlparse, urlunparse
+
+
+def redact_url(url: str) -> str:
+    """Return a URL safe for logs by removing userinfo and query/fragment.
+
+    Keeps scheme, host, port and path so logs stay useful for debugging.
+    """
+    try:
+        parsed = urlparse(url or "")
+        host = parsed.hostname or ""
+        if ":" in host:  # IPv6 literal — re-bracket so host:port stays unambiguous
+            host = f"[{host}]"
+        if parsed.port:
+            host = f"{host}:{parsed.port}"
+        return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
+    except Exception:
+        return "<endpoint>"
@@ -29,6 +29,7 @@ from routes.document_helpers import _owner_session_filter
 from core.database import SessionLocal, get_session_mode, set_session_mode
 from core.database import Session as DBSession, ChatMessage as DBChatMessage
 from core.database import Document as DBDocument, ModelEndpoint
+from core.log_safety import redact_url
 from routes.research_routes import _resolve_research_endpoint
 from routes.model_routes import _visible_models
 from routes.chat_helpers import (
@@ -930,7 +931,7 @@ def setup_chat_routes(
            if effective_do_research:
                _r_ep, _r_model, _r_headers = _resolve_research_endpoint(sess)
                _auth_keys = list(_r_headers.keys()) if _r_headers else []
-                logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={_r_ep}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")
+                logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={redact_url(_r_ep)}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")

                # Clarification round: only for very short/vague queries on first research message.
                # Skip in compare mode — each pane is a fresh session, so every one would
@@ -18,6 +18,7 @@ from pathlib import Path
 from datetime import datetime
 from urllib.parse import urljoin, urlparse, urlunparse

+from core.log_safety import redact_url
 from fastapi import APIRouter, Query, Depends, Response, HTTPException
 from typing import List, Dict, Optional

@@ -702,7 +703,7 @@ def _delete_contact(uid: str) -> bool:
                logger.warning(
                    f"CardDAV DELETE reported success for {uid} "
                    f"but UID still present after re-fetch — "
-                    f"resource URL may differ from {url}"
+                    f"resource URL may differ from {redact_url(url)}"
                )
                return False
            if r.status_code == 404:
@@ -17,6 +17,7 @@ from fastapi import APIRouter, HTTPException, Form, Query, Body, Request, Respon
 from pydantic import BaseModel
 from fastapi.responses import StreamingResponse
 from core.database import SessionLocal, ModelEndpoint, Session as DbSession
+from core.log_safety import redact_url as _redact_url_for_log
 from core.middleware import require_admin
 from src.llm_core import _detect_provider, _host_match, ANTHROPIC_MODELS
 from src.tls_overrides import llm_verify
@@ -582,18 +583,6 @@ def _safe_build_headers(api_key: Optional[str], base_url: str) -> dict:
        return {"Authorization": f"Bearer {api_key}"} if api_key else {}


-def _redact_url_for_log(url: str) -> str:
-    """Return a URL safe for logs by removing userinfo and query/fragment."""
-    try:
-        parsed = urlparse(url or "")
-        host = parsed.hostname or ""
-        if parsed.port:
-            host = f"{host}:{parsed.port}"
-        return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
-    except Exception:
-        return "<endpoint>"
-
-
 def _is_discovery_only_provider(provider: str) -> bool:
    return provider == "chatgpt-subscription"

@@ -810,9 +799,9 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
        logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
    except Exception as e:
        if api_key:
-            logger.warning(f"Failed to probe {url} with API key: {e}")
+            logger.warning("Failed to probe %s with API key: %s", _redact_url_for_log(url), e)
            return []
-        logger.warning(f"Failed to probe {url}: {e}")
+        logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)

    # Older Ollama builds and some proxies expose native /api/tags even when
    # the OpenAI-compatible /v1/models path is unavailable.
@@ -335,10 +335,11 @@ async def dispatch_reminder(
            # Loud diagnostic so we can see WHY a reminder didn't send (the
            # previous "silently no-op when cfg has no smtp_host" was invisible).
            logger.info(
-                f"dispatch_reminder[email] note_id={note_id} owner={owner!r} "
-                f"smtp_host={cfg.get('smtp_host')!r} smtp_user={cfg.get('smtp_user')!r} "
-                f"from={from_addr!r} recipient={recipient!r} "
-                f"account_name={cfg.get('account_name')!r}"
+                "dispatch_reminder[email] note_id=%s owner=%r "
+                "has_smtp_host=%s has_smtp_user=%s has_from=%s has_recipient=%s",
+                note_id, owner,
+                bool(cfg.get("smtp_host")), bool(cfg.get("smtp_user")),
+                bool(from_addr), bool(recipient),
            )
            missing = []
            if not cfg.get("smtp_host"):
@@ -1667,7 +1667,7 @@ class TaskScheduler:
            msg["X-Odysseus-Ref"] = str(task.id)
            msg.set_content(result or "")
            _send_smtp_message(cfg, from_addr, [to_addr], msg.as_string(), timeout=30)
-            logger.info("Task %s emailed result to %s (%sb)", task.id, to_addr, len(result or ""))
+            logger.info("Task %s emailed result (recipient_set=%s, %sb)", task.id, bool(to_addr), len(result or ""))
        except Exception as e:
            logger.error("Task %s email delivery failed: %s", task.id, e, exc_info=True)
            raise
@@ -2029,7 +2029,7 @@ class TaskScheduler:
                # silent SMTP failure is easier to spot in the logs.
                logger.info(
                    f"Task {task.id} delivered via MCP tool {tool_name} "
-                    f"(to={recipient or '<unset>'}, body={body_len}b, reply={stdout[:200]!r})"
+                    f"(recipient_set={bool(recipient)}, body={body_len}b, reply={stdout[:200]!r})"
                )
        except Exception as e:
            logger.error(f"Task {task.id} MCP delivery failed: {e}")
@@ -0,0 +1,32 @@
+from core.log_safety import redact_url
+
+
+def test_strips_userinfo():
+    assert redact_url("https://user:pass@host.example/v1/models") == "https://host.example/v1/models"
+
+
+def test_strips_query_and_fragment():
+    assert redact_url("https://host.example/v1?api_key=secret#frag") == "https://host.example/v1"
+
+
+def test_keeps_port_and_path():
+    assert redact_url("http://host.example:8080/api/tags") == "http://host.example:8080/api/tags"
+
+
+def test_ipv6_host_keeps_brackets():
+    assert redact_url("https://user:pass@[2001:db8::1]:8443/v1") == "https://[2001:db8::1]:8443/v1"
+    assert redact_url("https://[2001:db8::1]/v1") == "https://[2001:db8::1]/v1"
+
+
+def test_no_credentials_passthrough():
+    assert redact_url("https://host.example/v1/models") == "https://host.example/v1/models"
+
+
+def test_empty_and_none():
+    assert redact_url("") == ""
+    assert redact_url(None) == ""
+
+
+def test_garbage_does_not_raise():
+    # urlparse is lenient; just assert no credential-looking userinfo survives.
+    assert "@" not in redact_url("::::not a url::::")