From 7e5db9a3c6346caaabf1f0f49bab4be39be2ffe7 Mon Sep 17 00:00:00 2001
From: nopoz <bill.lowney@gmail.com>
Date: Mon, 22 Jun 2026 14:12:39 -0700
Subject: [PATCH] fix(security): redact credential-bearing URLs and PII from
 logs (#4750)

* fix(security): redact credential-bearing URLs and PII from logs

Several log statements emitted sensitive data in clear text:

- model_routes / chat_routes / contacts_routes logged endpoint URLs raw.
  Admin-configured URLs can embed credentials in userinfo or query
  (e.g. https://user:pass@host, ?api_key=...). Route them through a
  shared core.log_safety.redact_url() that drops userinfo/query/fragment.
- note_routes / task_scheduler logged operator email addresses (smtp_user,
  recipient). Replaced with presence booleans, which keeps the diagnostic
  ("why didn't this send") without writing PII to logs.

model_routes already had a local redactor on its HTTPStatusError branch;
the generic except branch was missed, so reuse the existing helper there.

Clears CodeQL py/clear-text-logging-sensitive-data alerts 264, 317, 324,
325, 343, 344, 528.

* fix(security): re-bracket IPv6 hosts and single-source the URL redactor

Address review on #4750:
- redact_url now re-brackets IPv6 literals so host:port stays
  unambiguous (https://[2001:db8::1]:8443/v1, not the bracket-less
  ambiguous form).
- point model_routes._redact_url_for_log at the shared helper so the
  two redactors are single-sourced (also picks up the IPv6 fix).
---
 core/log_safety.py        | 27 +++++++++++++++++++++++++++
 routes/chat_routes.py     |  3 ++-
 routes/contacts_routes.py |  3 ++-
 routes/model_routes.py    | 17 +++--------------
 routes/note_routes.py     |  9 +++++----
 src/task_scheduler.py     |  4 ++--
 tests/test_log_safety.py  | 32 ++++++++++++++++++++++++++++++++
 7 files changed, 73 insertions(+), 22 deletions(-)
 create mode 100644 core/log_safety.py
 create mode 100644 tests/test_log_safety.py
diff --git a/core/log_safety.py b/core/log_safety.py
new file mode 100644
index 000000000..2339a73b6
--- /dev/null
+++ b/core/log_safety.py
@@ -0,0 +1,27 @@
+"""Helpers for keeping sensitive data out of logs.
+
+Endpoint URLs configured by admins can embed credentials in the userinfo
+(``https://user:pass@host``) or query string (``?api_key=...``). Logging them
+raw leaks those secrets, so route/diagnostic logs run URLs through
+``redact_url`` first. Reconstructing the URL without userinfo/query/fragment
+also doubles as a sanitizer barrier for CodeQL's clear-text-logging query.
+"""
+
+from urllib.parse import urlparse, urlunparse
+
+
+def redact_url(url: str) -> str:
+    """Return a URL safe for logs by removing userinfo and query/fragment.
+
+    Keeps scheme, host, port and path so logs stay useful for debugging.
+    """
+    try:
+        parsed = urlparse(url or "")
+        host = parsed.hostname or ""
+        if ":" in host:  # IPv6 literal — re-bracket so host:port stays unambiguous
+            host = f"[{host}]"
+        if parsed.port:
+            host = f"{host}:{parsed.port}"
+        return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
+    except Exception:
+        return "<endpoint>"
diff --git a/routes/chat_routes.py b/routes/chat_routes.py
index b4a6ed837..a8632c0df 100644
--- a/routes/chat_routes.py
+++ b/routes/chat_routes.py
@@ -29,6 +29,7 @@ from routes.document_helpers import _owner_session_filter
 from core.database import SessionLocal, get_session_mode, set_session_mode
 from core.database import Session as DBSession, ChatMessage as DBChatMessage
 from core.database import Document as DBDocument, ModelEndpoint
+from core.log_safety import redact_url
 from routes.research_routes import _resolve_research_endpoint
 from routes.model_routes import _visible_models
 from routes.chat_helpers import (
@@ -930,7 +931,7 @@ def setup_chat_routes(
             if effective_do_research:
                 _r_ep, _r_model, _r_headers = _resolve_research_endpoint(sess)
                 _auth_keys = list(_r_headers.keys()) if _r_headers else []
-                logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={_r_ep}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")
+                logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={redact_url(_r_ep)}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")
 
                 # Clarification round: only for very short/vague queries on first research message.
                 # Skip in compare mode — each pane is a fresh session, so every one would
diff --git a/routes/contacts_routes.py b/routes/contacts_routes.py
index de0a5d041..7ba32d47a 100644
--- a/routes/contacts_routes.py
+++ b/routes/contacts_routes.py
@@ -18,6 +18,7 @@ from pathlib import Path
 from datetime import datetime
 from urllib.parse import urljoin, urlparse, urlunparse
 
+from core.log_safety import redact_url
 from fastapi import APIRouter, Query, Depends, Response, HTTPException
 from typing import List, Dict, Optional
 
@@ -702,7 +703,7 @@ def _delete_contact(uid: str) -> bool:
                 logger.warning(
                     f"CardDAV DELETE reported success for {uid} "
                     f"but UID still present after re-fetch — "
-                    f"resource URL may differ from {url}"
+                    f"resource URL may differ from {redact_url(url)}"
                 )
                 return False
             if r.status_code == 404:
diff --git a/routes/model_routes.py b/routes/model_routes.py
index fb9555438..dd5656f6a 100644
--- a/routes/model_routes.py
+++ b/routes/model_routes.py
@@ -17,6 +17,7 @@ from fastapi import APIRouter, HTTPException, Form, Query, Body, Request, Respon
 from pydantic import BaseModel
 from fastapi.responses import StreamingResponse
 from core.database import SessionLocal, ModelEndpoint, Session as DbSession
+from core.log_safety import redact_url as _redact_url_for_log
 from core.middleware import require_admin
 from src.llm_core import _detect_provider, _host_match, ANTHROPIC_MODELS
 from src.tls_overrides import llm_verify
@@ -582,18 +583,6 @@ def _safe_build_headers(api_key: Optional[str], base_url: str) -> dict:
         return {"Authorization": f"Bearer {api_key}"} if api_key else {}
 
 
-def _redact_url_for_log(url: str) -> str:
-    """Return a URL safe for logs by removing userinfo and query/fragment."""
-    try:
-        parsed = urlparse(url or "")
-        host = parsed.hostname or ""
-        if parsed.port:
-            host = f"{host}:{parsed.port}"
-        return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
-    except Exception:
-        return "<endpoint>"
-
-
 def _is_discovery_only_provider(provider: str) -> bool:
     return provider == "chatgpt-subscription"
 
@@ -810,9 +799,9 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
         logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
     except Exception as e:
         if api_key:
-            logger.warning(f"Failed to probe {url} with API key: {e}")
+            logger.warning("Failed to probe %s with API key: %s", _redact_url_for_log(url), e)
             return []
-        logger.warning(f"Failed to probe {url}: {e}")
+        logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
 
     # Older Ollama builds and some proxies expose native /api/tags even when
     # the OpenAI-compatible /v1/models path is unavailable.
diff --git a/routes/note_routes.py b/routes/note_routes.py
index c4674e489..2a91fedf8 100644
--- a/routes/note_routes.py
+++ b/routes/note_routes.py
@@ -335,10 +335,11 @@ async def dispatch_reminder(
             # Loud diagnostic so we can see WHY a reminder didn't send (the
             # previous "silently no-op when cfg has no smtp_host" was invisible).
             logger.info(
-                f"dispatch_reminder[email] note_id={note_id} owner={owner!r} "
-                f"smtp_host={cfg.get('smtp_host')!r} smtp_user={cfg.get('smtp_user')!r} "
-                f"from={from_addr!r} recipient={recipient!r} "
-                f"account_name={cfg.get('account_name')!r}"
+                "dispatch_reminder[email] note_id=%s owner=%r "
+                "has_smtp_host=%s has_smtp_user=%s has_from=%s has_recipient=%s",
+                note_id, owner,
+                bool(cfg.get("smtp_host")), bool(cfg.get("smtp_user")),
+                bool(from_addr), bool(recipient),
             )
             missing = []
             if not cfg.get("smtp_host"):
diff --git a/src/task_scheduler.py b/src/task_scheduler.py
index e5389df99..fa90eb92e 100644
--- a/src/task_scheduler.py
+++ b/src/task_scheduler.py
@@ -1667,7 +1667,7 @@ class TaskScheduler:
             msg["X-Odysseus-Ref"] = str(task.id)
             msg.set_content(result or "")
             _send_smtp_message(cfg, from_addr, [to_addr], msg.as_string(), timeout=30)
-            logger.info("Task %s emailed result to %s (%sb)", task.id, to_addr, len(result or ""))
+            logger.info("Task %s emailed result (recipient_set=%s, %sb)", task.id, bool(to_addr), len(result or ""))
         except Exception as e:
             logger.error("Task %s email delivery failed: %s", task.id, e, exc_info=True)
             raise
@@ -2029,7 +2029,7 @@ class TaskScheduler:
                 # silent SMTP failure is easier to spot in the logs.
                 logger.info(
                     f"Task {task.id} delivered via MCP tool {tool_name} "
-                    f"(to={recipient or '<unset>'}, body={body_len}b, reply={stdout[:200]!r})"
+                    f"(recipient_set={bool(recipient)}, body={body_len}b, reply={stdout[:200]!r})"
                 )
         except Exception as e:
             logger.error(f"Task {task.id} MCP delivery failed: {e}")
diff --git a/tests/test_log_safety.py b/tests/test_log_safety.py
new file mode 100644
index 000000000..b806516d2
--- /dev/null
+++ b/tests/test_log_safety.py
@@ -0,0 +1,32 @@
+from core.log_safety import redact_url
+
+
+def test_strips_userinfo():
+    assert redact_url("https://user:pass@host.example/v1/models") == "https://host.example/v1/models"
+
+
+def test_strips_query_and_fragment():
+    assert redact_url("https://host.example/v1?api_key=secret#frag") == "https://host.example/v1"
+
+
+def test_keeps_port_and_path():
+    assert redact_url("http://host.example:8080/api/tags") == "http://host.example:8080/api/tags"
+
+
+def test_ipv6_host_keeps_brackets():
+    assert redact_url("https://user:pass@[2001:db8::1]:8443/v1") == "https://[2001:db8::1]:8443/v1"
+    assert redact_url("https://[2001:db8::1]/v1") == "https://[2001:db8::1]/v1"
+
+
+def test_no_credentials_passthrough():
+    assert redact_url("https://host.example/v1/models") == "https://host.example/v1/models"
+
+
+def test_empty_and_none():
+    assert redact_url("") == ""
+    assert redact_url(None) == ""
+
+
+def test_garbage_does_not_raise():
+    # urlparse is lenient; just assert no credential-looking userinfo survives.
+    assert "@" not in redact_url("::::not a url::::")