fix(security): redact credential-bearing URLs and PII from logs (#4750)

* fix(security): redact credential-bearing URLs and PII from logs

Several log statements emitted sensitive data in clear text:

- model_routes / chat_routes / contacts_routes logged endpoint URLs raw.
  Admin-configured URLs can embed credentials in userinfo or query
  (e.g. https://user:pass@host, ?api_key=...). Route them through a
  shared core.log_safety.redact_url() that drops userinfo/query/fragment.
- note_routes / task_scheduler logged operator email addresses (smtp_user,
  recipient). Replaced with presence booleans, which keeps the diagnostic
  ("why didn't this send") without writing PII to logs.

model_routes already had a local redactor on its HTTPStatusError branch;
the generic except branch was missed, so reuse the existing helper there.

Clears CodeQL py/clear-text-logging-sensitive-data alerts 264, 317, 324,
325, 343, 344, 528.

* fix(security): re-bracket IPv6 hosts and single-source the URL redactor

Address review on #4750:
- redact_url now re-brackets IPv6 literals so host:port stays
  unambiguous (https://[2001:db8::1]:8443/v1, not the bracket-less
  ambiguous form).
- point model_routes._redact_url_for_log at the shared helper so the
  two redactors are single-sourced (also picks up the IPv6 fix).
This commit is contained in:
nopoz
2026-06-22 14:12:39 -07:00
committed by GitHub
parent 2f246c7779
commit 7e5db9a3c6
7 changed files with 73 additions and 22 deletions
+27
View File
@@ -0,0 +1,27 @@
"""Helpers for keeping sensitive data out of logs.
Endpoint URLs configured by admins can embed credentials in the userinfo
(``https://user:pass@host``) or query string (``?api_key=...``). Logging them
raw leaks those secrets, so route/diagnostic logs run URLs through
``redact_url`` first. Reconstructing the URL without userinfo/query/fragment
also doubles as a sanitizer barrier for CodeQL's clear-text-logging query.
"""
from urllib.parse import urlparse, urlunparse
def redact_url(url: str) -> str:
"""Return a URL safe for logs by removing userinfo and query/fragment.
Keeps scheme, host, port and path so logs stay useful for debugging.
"""
try:
parsed = urlparse(url or "")
host = parsed.hostname or ""
if ":" in host: # IPv6 literal — re-bracket so host:port stays unambiguous
host = f"[{host}]"
if parsed.port:
host = f"{host}:{parsed.port}"
return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
except Exception:
return "<endpoint>"
+2 -1
View File
@@ -29,6 +29,7 @@ from routes.document_helpers import _owner_session_filter
from core.database import SessionLocal, get_session_mode, set_session_mode
from core.database import Session as DBSession, ChatMessage as DBChatMessage
from core.database import Document as DBDocument, ModelEndpoint
from core.log_safety import redact_url
from routes.research_routes import _resolve_research_endpoint
from routes.model_routes import _visible_models
from routes.chat_helpers import (
@@ -930,7 +931,7 @@ def setup_chat_routes(
if effective_do_research:
_r_ep, _r_model, _r_headers = _resolve_research_endpoint(sess)
_auth_keys = list(_r_headers.keys()) if _r_headers else []
logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={_r_ep}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")
logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={redact_url(_r_ep)}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")
# Clarification round: only for very short/vague queries on first research message.
# Skip in compare mode — each pane is a fresh session, so every one would
+2 -1
View File
@@ -18,6 +18,7 @@ from pathlib import Path
from datetime import datetime
from urllib.parse import urljoin, urlparse, urlunparse
from core.log_safety import redact_url
from fastapi import APIRouter, Query, Depends, Response, HTTPException
from typing import List, Dict, Optional
@@ -702,7 +703,7 @@ def _delete_contact(uid: str) -> bool:
logger.warning(
f"CardDAV DELETE reported success for {uid} "
f"but UID still present after re-fetch — "
f"resource URL may differ from {url}"
f"resource URL may differ from {redact_url(url)}"
)
return False
if r.status_code == 404:
+3 -14
View File
@@ -17,6 +17,7 @@ from fastapi import APIRouter, HTTPException, Form, Query, Body, Request, Respon
from pydantic import BaseModel
from fastapi.responses import StreamingResponse
from core.database import SessionLocal, ModelEndpoint, Session as DbSession
from core.log_safety import redact_url as _redact_url_for_log
from core.middleware import require_admin
from src.llm_core import _detect_provider, _host_match, ANTHROPIC_MODELS
from src.tls_overrides import llm_verify
@@ -582,18 +583,6 @@ def _safe_build_headers(api_key: Optional[str], base_url: str) -> dict:
return {"Authorization": f"Bearer {api_key}"} if api_key else {}
def _redact_url_for_log(url: str) -> str:
"""Return a URL safe for logs by removing userinfo and query/fragment."""
try:
parsed = urlparse(url or "")
host = parsed.hostname or ""
if parsed.port:
host = f"{host}:{parsed.port}"
return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
except Exception:
return "<endpoint>"
def _is_discovery_only_provider(provider: str) -> bool:
return provider == "chatgpt-subscription"
@@ -810,9 +799,9 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
except Exception as e:
if api_key:
logger.warning(f"Failed to probe {url} with API key: {e}")
logger.warning("Failed to probe %s with API key: %s", _redact_url_for_log(url), e)
return []
logger.warning(f"Failed to probe {url}: {e}")
logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
# Older Ollama builds and some proxies expose native /api/tags even when
# the OpenAI-compatible /v1/models path is unavailable.
+5 -4
View File
@@ -335,10 +335,11 @@ async def dispatch_reminder(
# Loud diagnostic so we can see WHY a reminder didn't send (the
# previous "silently no-op when cfg has no smtp_host" was invisible).
logger.info(
f"dispatch_reminder[email] note_id={note_id} owner={owner!r} "
f"smtp_host={cfg.get('smtp_host')!r} smtp_user={cfg.get('smtp_user')!r} "
f"from={from_addr!r} recipient={recipient!r} "
f"account_name={cfg.get('account_name')!r}"
"dispatch_reminder[email] note_id=%s owner=%r "
"has_smtp_host=%s has_smtp_user=%s has_from=%s has_recipient=%s",
note_id, owner,
bool(cfg.get("smtp_host")), bool(cfg.get("smtp_user")),
bool(from_addr), bool(recipient),
)
missing = []
if not cfg.get("smtp_host"):
+2 -2
View File
@@ -1667,7 +1667,7 @@ class TaskScheduler:
msg["X-Odysseus-Ref"] = str(task.id)
msg.set_content(result or "")
_send_smtp_message(cfg, from_addr, [to_addr], msg.as_string(), timeout=30)
logger.info("Task %s emailed result to %s (%sb)", task.id, to_addr, len(result or ""))
logger.info("Task %s emailed result (recipient_set=%s, %sb)", task.id, bool(to_addr), len(result or ""))
except Exception as e:
logger.error("Task %s email delivery failed: %s", task.id, e, exc_info=True)
raise
@@ -2029,7 +2029,7 @@ class TaskScheduler:
# silent SMTP failure is easier to spot in the logs.
logger.info(
f"Task {task.id} delivered via MCP tool {tool_name} "
f"(to={recipient or '<unset>'}, body={body_len}b, reply={stdout[:200]!r})"
f"(recipient_set={bool(recipient)}, body={body_len}b, reply={stdout[:200]!r})"
)
except Exception as e:
logger.error(f"Task {task.id} MCP delivery failed: {e}")
+32
View File
@@ -0,0 +1,32 @@
from core.log_safety import redact_url
def test_strips_userinfo():
assert redact_url("https://user:pass@host.example/v1/models") == "https://host.example/v1/models"
def test_strips_query_and_fragment():
assert redact_url("https://host.example/v1?api_key=secret#frag") == "https://host.example/v1"
def test_keeps_port_and_path():
assert redact_url("http://host.example:8080/api/tags") == "http://host.example:8080/api/tags"
def test_ipv6_host_keeps_brackets():
assert redact_url("https://user:pass@[2001:db8::1]:8443/v1") == "https://[2001:db8::1]:8443/v1"
assert redact_url("https://[2001:db8::1]/v1") == "https://[2001:db8::1]/v1"
def test_no_credentials_passthrough():
assert redact_url("https://host.example/v1/models") == "https://host.example/v1/models"
def test_empty_and_none():
assert redact_url("") == ""
assert redact_url(None) == ""
def test_garbage_does_not_raise():
# urlparse is lenient; just assert no credential-looking userinfo survives.
assert "@" not in redact_url("::::not a url::::")