fix(security): redact credential-bearing URLs and PII from logs (#4750)

* fix(security): redact credential-bearing URLs and PII from logs

Several log statements emitted sensitive data in clear text:

- model_routes / chat_routes / contacts_routes logged endpoint URLs raw.
  Admin-configured URLs can embed credentials in userinfo or query
  (e.g. https://user:pass@host, ?api_key=...). Route them through a
  shared core.log_safety.redact_url() that drops userinfo/query/fragment.
- note_routes / task_scheduler logged operator email addresses (smtp_user,
  recipient). Replaced with presence booleans, which keeps the diagnostic
  ("why didn't this send") without writing PII to logs.

model_routes already had a local redactor on its HTTPStatusError branch;
the generic except branch was missed, so reuse the existing helper there.

Clears CodeQL py/clear-text-logging-sensitive-data alerts 264, 317, 324,
325, 343, 344, 528.

* fix(security): re-bracket IPv6 hosts and single-source the URL redactor

Address review on #4750:
- redact_url now re-brackets IPv6 literals so host:port stays
  unambiguous (https://[2001:db8::1]:8443/v1, not the bracket-less
  ambiguous form).
- point model_routes._redact_url_for_log at the shared helper so the
  two redactors are single-sourced (also picks up the IPv6 fix).
This commit is contained in:
nopoz
2026-06-22 14:12:39 -07:00
committed by GitHub
parent 2f246c7779
commit 7e5db9a3c6
7 changed files with 73 additions and 22 deletions
+27
View File
@@ -0,0 +1,27 @@
"""Helpers for keeping sensitive data out of logs.
Endpoint URLs configured by admins can embed credentials in the userinfo
(``https://user:pass@host``) or query string (``?api_key=...``). Logging them
raw leaks those secrets, so route/diagnostic logs run URLs through
``redact_url`` first. Reconstructing the URL without userinfo/query/fragment
also doubles as a sanitizer barrier for CodeQL's clear-text-logging query.
"""
from urllib.parse import urlparse, urlunparse
def redact_url(url: str) -> str:
"""Return a URL safe for logs by removing userinfo and query/fragment.
Keeps scheme, host, port and path so logs stay useful for debugging.
"""
try:
parsed = urlparse(url or "")
host = parsed.hostname or ""
if ":" in host: # IPv6 literal — re-bracket so host:port stays unambiguous
host = f"[{host}]"
if parsed.port:
host = f"{host}:{parsed.port}"
return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
except Exception:
return "<endpoint>"
+2 -1
View File
@@ -29,6 +29,7 @@ from routes.document_helpers import _owner_session_filter
from core.database import SessionLocal, get_session_mode, set_session_mode from core.database import SessionLocal, get_session_mode, set_session_mode
from core.database import Session as DBSession, ChatMessage as DBChatMessage from core.database import Session as DBSession, ChatMessage as DBChatMessage
from core.database import Document as DBDocument, ModelEndpoint from core.database import Document as DBDocument, ModelEndpoint
from core.log_safety import redact_url
from routes.research_routes import _resolve_research_endpoint from routes.research_routes import _resolve_research_endpoint
from routes.model_routes import _visible_models from routes.model_routes import _visible_models
from routes.chat_helpers import ( from routes.chat_helpers import (
@@ -930,7 +931,7 @@ def setup_chat_routes(
if effective_do_research: if effective_do_research:
_r_ep, _r_model, _r_headers = _resolve_research_endpoint(sess) _r_ep, _r_model, _r_headers = _resolve_research_endpoint(sess)
_auth_keys = list(_r_headers.keys()) if _r_headers else [] _auth_keys = list(_r_headers.keys()) if _r_headers else []
logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={_r_ep}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}") logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={redact_url(_r_ep)}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")
# Clarification round: only for very short/vague queries on first research message. # Clarification round: only for very short/vague queries on first research message.
# Skip in compare mode — each pane is a fresh session, so every one would # Skip in compare mode — each pane is a fresh session, so every one would
+2 -1
View File
@@ -18,6 +18,7 @@ from pathlib import Path
from datetime import datetime from datetime import datetime
from urllib.parse import urljoin, urlparse, urlunparse from urllib.parse import urljoin, urlparse, urlunparse
from core.log_safety import redact_url
from fastapi import APIRouter, Query, Depends, Response, HTTPException from fastapi import APIRouter, Query, Depends, Response, HTTPException
from typing import List, Dict, Optional from typing import List, Dict, Optional
@@ -702,7 +703,7 @@ def _delete_contact(uid: str) -> bool:
logger.warning( logger.warning(
f"CardDAV DELETE reported success for {uid} " f"CardDAV DELETE reported success for {uid} "
f"but UID still present after re-fetch — " f"but UID still present after re-fetch — "
f"resource URL may differ from {url}" f"resource URL may differ from {redact_url(url)}"
) )
return False return False
if r.status_code == 404: if r.status_code == 404:
+3 -14
View File
@@ -17,6 +17,7 @@ from fastapi import APIRouter, HTTPException, Form, Query, Body, Request, Respon
from pydantic import BaseModel from pydantic import BaseModel
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
from core.database import SessionLocal, ModelEndpoint, Session as DbSession from core.database import SessionLocal, ModelEndpoint, Session as DbSession
from core.log_safety import redact_url as _redact_url_for_log
from core.middleware import require_admin from core.middleware import require_admin
from src.llm_core import _detect_provider, _host_match, ANTHROPIC_MODELS from src.llm_core import _detect_provider, _host_match, ANTHROPIC_MODELS
from src.tls_overrides import llm_verify from src.tls_overrides import llm_verify
@@ -582,18 +583,6 @@ def _safe_build_headers(api_key: Optional[str], base_url: str) -> dict:
return {"Authorization": f"Bearer {api_key}"} if api_key else {} return {"Authorization": f"Bearer {api_key}"} if api_key else {}
def _redact_url_for_log(url: str) -> str:
"""Return a URL safe for logs by removing userinfo and query/fragment."""
try:
parsed = urlparse(url or "")
host = parsed.hostname or ""
if parsed.port:
host = f"{host}:{parsed.port}"
return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
except Exception:
return "<endpoint>"
def _is_discovery_only_provider(provider: str) -> bool: def _is_discovery_only_provider(provider: str) -> bool:
return provider == "chatgpt-subscription" return provider == "chatgpt-subscription"
@@ -810,9 +799,9 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e) logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
except Exception as e: except Exception as e:
if api_key: if api_key:
logger.warning(f"Failed to probe {url} with API key: {e}") logger.warning("Failed to probe %s with API key: %s", _redact_url_for_log(url), e)
return [] return []
logger.warning(f"Failed to probe {url}: {e}") logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
# Older Ollama builds and some proxies expose native /api/tags even when # Older Ollama builds and some proxies expose native /api/tags even when
# the OpenAI-compatible /v1/models path is unavailable. # the OpenAI-compatible /v1/models path is unavailable.
+5 -4
View File
@@ -335,10 +335,11 @@ async def dispatch_reminder(
# Loud diagnostic so we can see WHY a reminder didn't send (the # Loud diagnostic so we can see WHY a reminder didn't send (the
# previous "silently no-op when cfg has no smtp_host" was invisible). # previous "silently no-op when cfg has no smtp_host" was invisible).
logger.info( logger.info(
f"dispatch_reminder[email] note_id={note_id} owner={owner!r} " "dispatch_reminder[email] note_id=%s owner=%r "
f"smtp_host={cfg.get('smtp_host')!r} smtp_user={cfg.get('smtp_user')!r} " "has_smtp_host=%s has_smtp_user=%s has_from=%s has_recipient=%s",
f"from={from_addr!r} recipient={recipient!r} " note_id, owner,
f"account_name={cfg.get('account_name')!r}" bool(cfg.get("smtp_host")), bool(cfg.get("smtp_user")),
bool(from_addr), bool(recipient),
) )
missing = [] missing = []
if not cfg.get("smtp_host"): if not cfg.get("smtp_host"):
+2 -2
View File
@@ -1667,7 +1667,7 @@ class TaskScheduler:
msg["X-Odysseus-Ref"] = str(task.id) msg["X-Odysseus-Ref"] = str(task.id)
msg.set_content(result or "") msg.set_content(result or "")
_send_smtp_message(cfg, from_addr, [to_addr], msg.as_string(), timeout=30) _send_smtp_message(cfg, from_addr, [to_addr], msg.as_string(), timeout=30)
logger.info("Task %s emailed result to %s (%sb)", task.id, to_addr, len(result or "")) logger.info("Task %s emailed result (recipient_set=%s, %sb)", task.id, bool(to_addr), len(result or ""))
except Exception as e: except Exception as e:
logger.error("Task %s email delivery failed: %s", task.id, e, exc_info=True) logger.error("Task %s email delivery failed: %s", task.id, e, exc_info=True)
raise raise
@@ -2029,7 +2029,7 @@ class TaskScheduler:
# silent SMTP failure is easier to spot in the logs. # silent SMTP failure is easier to spot in the logs.
logger.info( logger.info(
f"Task {task.id} delivered via MCP tool {tool_name} " f"Task {task.id} delivered via MCP tool {tool_name} "
f"(to={recipient or '<unset>'}, body={body_len}b, reply={stdout[:200]!r})" f"(recipient_set={bool(recipient)}, body={body_len}b, reply={stdout[:200]!r})"
) )
except Exception as e: except Exception as e:
logger.error(f"Task {task.id} MCP delivery failed: {e}") logger.error(f"Task {task.id} MCP delivery failed: {e}")
+32
View File
@@ -0,0 +1,32 @@
from core.log_safety import redact_url
def test_strips_userinfo():
assert redact_url("https://user:pass@host.example/v1/models") == "https://host.example/v1/models"
def test_strips_query_and_fragment():
assert redact_url("https://host.example/v1?api_key=secret#frag") == "https://host.example/v1"
def test_keeps_port_and_path():
assert redact_url("http://host.example:8080/api/tags") == "http://host.example:8080/api/tags"
def test_ipv6_host_keeps_brackets():
assert redact_url("https://user:pass@[2001:db8::1]:8443/v1") == "https://[2001:db8::1]:8443/v1"
assert redact_url("https://[2001:db8::1]/v1") == "https://[2001:db8::1]/v1"
def test_no_credentials_passthrough():
assert redact_url("https://host.example/v1/models") == "https://host.example/v1/models"
def test_empty_and_none():
assert redact_url("") == ""
assert redact_url(None) == ""
def test_garbage_does_not_raise():
# urlparse is lenient; just assert no credential-looking userinfo survives.
assert "@" not in redact_url("::::not a url::::")