From 7e5db9a3c6346caaabf1f0f49bab4be39be2ffe7 Mon Sep 17 00:00:00 2001 From: nopoz Date: Mon, 22 Jun 2026 14:12:39 -0700 Subject: [PATCH] fix(security): redact credential-bearing URLs and PII from logs (#4750) * fix(security): redact credential-bearing URLs and PII from logs Several log statements emitted sensitive data in clear text: - model_routes / chat_routes / contacts_routes logged endpoint URLs raw. Admin-configured URLs can embed credentials in userinfo or query (e.g. https://user:pass@host, ?api_key=...). Route them through a shared core.log_safety.redact_url() that drops userinfo/query/fragment. - note_routes / task_scheduler logged operator email addresses (smtp_user, recipient). Replaced with presence booleans, which keeps the diagnostic ("why didn't this send") without writing PII to logs. model_routes already had a local redactor on its HTTPStatusError branch; the generic except branch was missed, so reuse the existing helper there. Clears CodeQL py/clear-text-logging-sensitive-data alerts 264, 317, 324, 325, 343, 344, 528. * fix(security): re-bracket IPv6 hosts and single-source the URL redactor Address review on #4750: - redact_url now re-brackets IPv6 literals so host:port stays unambiguous (https://[2001:db8::1]:8443/v1, not the bracket-less ambiguous form). - point model_routes._redact_url_for_log at the shared helper so the two redactors are single-sourced (also picks up the IPv6 fix). --- core/log_safety.py | 27 +++++++++++++++++++++++++++ routes/chat_routes.py | 3 ++- routes/contacts_routes.py | 3 ++- routes/model_routes.py | 17 +++-------------- routes/note_routes.py | 9 +++++---- src/task_scheduler.py | 4 ++-- tests/test_log_safety.py | 32 ++++++++++++++++++++++++++++++++ 7 files changed, 73 insertions(+), 22 deletions(-) create mode 100644 core/log_safety.py create mode 100644 tests/test_log_safety.py diff --git a/core/log_safety.py b/core/log_safety.py new file mode 100644 index 000000000..2339a73b6 --- /dev/null +++ b/core/log_safety.py @@ -0,0 +1,27 @@ +"""Helpers for keeping sensitive data out of logs. + +Endpoint URLs configured by admins can embed credentials in the userinfo +(``https://user:pass@host``) or query string (``?api_key=...``). Logging them +raw leaks those secrets, so route/diagnostic logs run URLs through +``redact_url`` first. Reconstructing the URL without userinfo/query/fragment +also doubles as a sanitizer barrier for CodeQL's clear-text-logging query. +""" + +from urllib.parse import urlparse, urlunparse + + +def redact_url(url: str) -> str: + """Return a URL safe for logs by removing userinfo and query/fragment. + + Keeps scheme, host, port and path so logs stay useful for debugging. + """ + try: + parsed = urlparse(url or "") + host = parsed.hostname or "" + if ":" in host: # IPv6 literal — re-bracket so host:port stays unambiguous + host = f"[{host}]" + if parsed.port: + host = f"{host}:{parsed.port}" + return urlunparse((parsed.scheme, host, parsed.path, "", "", "")) + except Exception: + return "" diff --git a/routes/chat_routes.py b/routes/chat_routes.py index b4a6ed837..a8632c0df 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -29,6 +29,7 @@ from routes.document_helpers import _owner_session_filter from core.database import SessionLocal, get_session_mode, set_session_mode from core.database import Session as DBSession, ChatMessage as DBChatMessage from core.database import Document as DBDocument, ModelEndpoint +from core.log_safety import redact_url from routes.research_routes import _resolve_research_endpoint from routes.model_routes import _visible_models from routes.chat_helpers import ( @@ -930,7 +931,7 @@ def setup_chat_routes( if effective_do_research: _r_ep, _r_model, _r_headers = _resolve_research_endpoint(sess) _auth_keys = list(_r_headers.keys()) if _r_headers else [] - logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={_r_ep}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}") + logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={redact_url(_r_ep)}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}") # Clarification round: only for very short/vague queries on first research message. # Skip in compare mode — each pane is a fresh session, so every one would diff --git a/routes/contacts_routes.py b/routes/contacts_routes.py index de0a5d041..7ba32d47a 100644 --- a/routes/contacts_routes.py +++ b/routes/contacts_routes.py @@ -18,6 +18,7 @@ from pathlib import Path from datetime import datetime from urllib.parse import urljoin, urlparse, urlunparse +from core.log_safety import redact_url from fastapi import APIRouter, Query, Depends, Response, HTTPException from typing import List, Dict, Optional @@ -702,7 +703,7 @@ def _delete_contact(uid: str) -> bool: logger.warning( f"CardDAV DELETE reported success for {uid} " f"but UID still present after re-fetch — " - f"resource URL may differ from {url}" + f"resource URL may differ from {redact_url(url)}" ) return False if r.status_code == 404: diff --git a/routes/model_routes.py b/routes/model_routes.py index fb9555438..dd5656f6a 100644 --- a/routes/model_routes.py +++ b/routes/model_routes.py @@ -17,6 +17,7 @@ from fastapi import APIRouter, HTTPException, Form, Query, Body, Request, Respon from pydantic import BaseModel from fastapi.responses import StreamingResponse from core.database import SessionLocal, ModelEndpoint, Session as DbSession +from core.log_safety import redact_url as _redact_url_for_log from core.middleware import require_admin from src.llm_core import _detect_provider, _host_match, ANTHROPIC_MODELS from src.tls_overrides import llm_verify @@ -582,18 +583,6 @@ def _safe_build_headers(api_key: Optional[str], base_url: str) -> dict: return {"Authorization": f"Bearer {api_key}"} if api_key else {} -def _redact_url_for_log(url: str) -> str: - """Return a URL safe for logs by removing userinfo and query/fragment.""" - try: - parsed = urlparse(url or "") - host = parsed.hostname or "" - if parsed.port: - host = f"{host}:{parsed.port}" - return urlunparse((parsed.scheme, host, parsed.path, "", "", "")) - except Exception: - return "" - - def _is_discovery_only_provider(provider: str) -> bool: return provider == "chatgpt-subscription" @@ -810,9 +799,9 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e) except Exception as e: if api_key: - logger.warning(f"Failed to probe {url} with API key: {e}") + logger.warning("Failed to probe %s with API key: %s", _redact_url_for_log(url), e) return [] - logger.warning(f"Failed to probe {url}: {e}") + logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e) # Older Ollama builds and some proxies expose native /api/tags even when # the OpenAI-compatible /v1/models path is unavailable. diff --git a/routes/note_routes.py b/routes/note_routes.py index c4674e489..2a91fedf8 100644 --- a/routes/note_routes.py +++ b/routes/note_routes.py @@ -335,10 +335,11 @@ async def dispatch_reminder( # Loud diagnostic so we can see WHY a reminder didn't send (the # previous "silently no-op when cfg has no smtp_host" was invisible). logger.info( - f"dispatch_reminder[email] note_id={note_id} owner={owner!r} " - f"smtp_host={cfg.get('smtp_host')!r} smtp_user={cfg.get('smtp_user')!r} " - f"from={from_addr!r} recipient={recipient!r} " - f"account_name={cfg.get('account_name')!r}" + "dispatch_reminder[email] note_id=%s owner=%r " + "has_smtp_host=%s has_smtp_user=%s has_from=%s has_recipient=%s", + note_id, owner, + bool(cfg.get("smtp_host")), bool(cfg.get("smtp_user")), + bool(from_addr), bool(recipient), ) missing = [] if not cfg.get("smtp_host"): diff --git a/src/task_scheduler.py b/src/task_scheduler.py index e5389df99..fa90eb92e 100644 --- a/src/task_scheduler.py +++ b/src/task_scheduler.py @@ -1667,7 +1667,7 @@ class TaskScheduler: msg["X-Odysseus-Ref"] = str(task.id) msg.set_content(result or "") _send_smtp_message(cfg, from_addr, [to_addr], msg.as_string(), timeout=30) - logger.info("Task %s emailed result to %s (%sb)", task.id, to_addr, len(result or "")) + logger.info("Task %s emailed result (recipient_set=%s, %sb)", task.id, bool(to_addr), len(result or "")) except Exception as e: logger.error("Task %s email delivery failed: %s", task.id, e, exc_info=True) raise @@ -2029,7 +2029,7 @@ class TaskScheduler: # silent SMTP failure is easier to spot in the logs. logger.info( f"Task {task.id} delivered via MCP tool {tool_name} " - f"(to={recipient or ''}, body={body_len}b, reply={stdout[:200]!r})" + f"(recipient_set={bool(recipient)}, body={body_len}b, reply={stdout[:200]!r})" ) except Exception as e: logger.error(f"Task {task.id} MCP delivery failed: {e}") diff --git a/tests/test_log_safety.py b/tests/test_log_safety.py new file mode 100644 index 000000000..b806516d2 --- /dev/null +++ b/tests/test_log_safety.py @@ -0,0 +1,32 @@ +from core.log_safety import redact_url + + +def test_strips_userinfo(): + assert redact_url("https://user:pass@host.example/v1/models") == "https://host.example/v1/models" + + +def test_strips_query_and_fragment(): + assert redact_url("https://host.example/v1?api_key=secret#frag") == "https://host.example/v1" + + +def test_keeps_port_and_path(): + assert redact_url("http://host.example:8080/api/tags") == "http://host.example:8080/api/tags" + + +def test_ipv6_host_keeps_brackets(): + assert redact_url("https://user:pass@[2001:db8::1]:8443/v1") == "https://[2001:db8::1]:8443/v1" + assert redact_url("https://[2001:db8::1]/v1") == "https://[2001:db8::1]/v1" + + +def test_no_credentials_passthrough(): + assert redact_url("https://host.example/v1/models") == "https://host.example/v1/models" + + +def test_empty_and_none(): + assert redact_url("") == "" + assert redact_url(None) == "" + + +def test_garbage_does_not_raise(): + # urlparse is lenient; just assert no credential-looking userinfo survives. + assert "@" not in redact_url("::::not a url::::")