mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-30 08:32:07 -04:00
fix(chat): sanitize web search query to strip markdown and code blocks (#4863)
Layer a defensive cleanup on top of the generated-query web-search flow so the final selected query is sanitized before reaching comprehensive_web_search. - remove fenced code blocks from the final search query - preserve inline code as plain text - collapse whitespace and cap query length - cover generated-query success plus LLM failure/empty fallback paths Partially addresses #4547.
This commit is contained in:
@@ -12,6 +12,45 @@ from src.prompt_security import UNTRUSTED_CONTEXT_POLICY, untrusted_context_mess
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_search_query(query: str, max_len: int = 200) -> str:
|
||||||
|
"""Strip fenced code blocks from a search query while preserving inline
|
||||||
|
code text.
|
||||||
|
|
||||||
|
This is a focused, defensive cleanup for the *final* web-search query
|
||||||
|
selected in ``build_context_preface`` (issue #4547): regardless of whether
|
||||||
|
the query came from the LLM-generated path (#4557) or the first-line
|
||||||
|
fallback, residual fenced / inline markdown should not leak into the search
|
||||||
|
call. Rather than using regex (which is brittle and strips inline code
|
||||||
|
text like ``git reset`` from the query), we render the query to HTML via
|
||||||
|
``markdown`` and parse it with ``BeautifulSoup`` so that:
|
||||||
|
|
||||||
|
* ``<pre>`` blocks (fenced / indented code) are removed entirely.
|
||||||
|
* ``<code>`` elements (inline code) are preserved as plain text.
|
||||||
|
|
||||||
|
Both libraries are already project dependencies. The result is whitespace
|
||||||
|
collapsed and truncated to ``max_len``; an all-code input collapses to an
|
||||||
|
empty string, which the caller treats as "no query".
|
||||||
|
"""
|
||||||
|
import markdown as _md
|
||||||
|
from bs4 import BeautifulSoup as _BS
|
||||||
|
|
||||||
|
html = _md.markdown(query, extensions=["fenced_code"])
|
||||||
|
soup = _BS(html, "html.parser")
|
||||||
|
|
||||||
|
# Remove fenced / indented code blocks.
|
||||||
|
for pre in soup.find_all("pre"):
|
||||||
|
pre.decompose()
|
||||||
|
|
||||||
|
# Preserve inline code by unwrapping <code> to text.
|
||||||
|
for code in soup.find_all("code"):
|
||||||
|
code.replace_with(code.get_text())
|
||||||
|
|
||||||
|
text = soup.get_text(" ", strip=True)
|
||||||
|
text = re.sub(r"\s+", " ", text)
|
||||||
|
return text[:max_len]
|
||||||
|
|
||||||
|
|
||||||
# ── Stopwords & tokenizer ──
|
# ── Stopwords & tokenizer ──
|
||||||
|
|
||||||
_STOPWORDS = frozenset(
|
_STOPWORDS = frozenset(
|
||||||
@@ -322,6 +361,13 @@ class ChatProcessor:
|
|||||||
if len(search_query) > 150:
|
if len(search_query) > 150:
|
||||||
search_query = search_query[:150].strip()
|
search_query = search_query[:150].strip()
|
||||||
|
|
||||||
|
# Defensive cleanup of the final selected query (interim fix
|
||||||
|
# for #4547): strip any residual fenced/inline markdown so that
|
||||||
|
# neither the generated query nor the first-line fallback leaks
|
||||||
|
# fences or backticks into the search call. No-op on clean
|
||||||
|
# generated queries; collapses to "" when the query is all code.
|
||||||
|
search_query = _clean_search_query(search_query, max_len=150)
|
||||||
|
|
||||||
if search_query:
|
if search_query:
|
||||||
# Execute web search using the final selected query
|
# Execute web search using the final selected query
|
||||||
web_context, web_sources = comprehensive_web_search(
|
web_context, web_sources = comprehensive_web_search(
|
||||||
|
|||||||
@@ -0,0 +1,209 @@
|
|||||||
|
"""Regression tests for #4547 — chat-mode web search query sanitization.
|
||||||
|
|
||||||
|
Chat-mode web search (``use_web``) selects a search query via the
|
||||||
|
generated-query flow added in #4557: an LLM extracts a concise query, falling
|
||||||
|
back to the first non-empty line of the user message when the LLM fails or
|
||||||
|
returns an empty result. PR #4863 layers a focused, *defensive* cleanup on top
|
||||||
|
of that flow: whatever query is finally selected (generated or fallback) is
|
||||||
|
passed through ``_clean_search_query()`` before reaching
|
||||||
|
``comprehensive_web_search()``, so residual fenced/inline markdown never leaks
|
||||||
|
into the search call.
|
||||||
|
|
||||||
|
``_clean_search_query()`` renders the query to HTML via ``markdown``
|
||||||
|
(``fenced_code`` extension), drops ``<pre>`` blocks entirely, unwraps inline
|
||||||
|
``<code>`` to its text (so ``git reset`` survives), collapses whitespace, and
|
||||||
|
truncates.
|
||||||
|
|
||||||
|
The first four tests pin the helper directly; the last three prove it is
|
||||||
|
wired into the production path and that the combined generated-query +
|
||||||
|
sanitization behaviour holds for all three selection outcomes (generated
|
||||||
|
success, LLM exception, empty LLM result).
|
||||||
|
|
||||||
|
This is intentionally a narrow interim/defensive fix for #4547; it does not
|
||||||
|
replace the generated-query flow from #4557.
|
||||||
|
"""
|
||||||
|
from src.chat_processor import ChatProcessor, _clean_search_query
|
||||||
|
|
||||||
|
|
||||||
|
# ── Unit tests: _clean_search_query ──
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_search_query_removes_fenced_code_blocks():
|
||||||
|
"""A fenced code block must be dropped entirely, including the code body
|
||||||
|
and the fences — only the surrounding prose survives."""
|
||||||
|
message = '```python\nprint("hello")\n```\nWhat is the capital of France?'
|
||||||
|
|
||||||
|
result = _clean_search_query(message)
|
||||||
|
|
||||||
|
assert result == "What is the capital of France?"
|
||||||
|
# Guards against the original leak: no fences, no code body.
|
||||||
|
assert "```" not in result
|
||||||
|
assert "print" not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_search_query_preserves_inline_code():
|
||||||
|
"""Inline code text is search-relevant and must survive unwrapped; only the
|
||||||
|
backticks are removed. This is the ``git reset`` case the reviewer flagged
|
||||||
|
against the earlier regex approach (which dropped the word entirely)."""
|
||||||
|
message = "Is it a good idea to use `git reset` to undo my changes?"
|
||||||
|
|
||||||
|
result = _clean_search_query(message)
|
||||||
|
|
||||||
|
assert result == "Is it a good idea to use git reset to undo my changes?"
|
||||||
|
assert "git reset" in result
|
||||||
|
assert "`" not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_search_query_collapses_whitespace():
|
||||||
|
"""Runs of whitespace (tabs, multiple spaces, newlines) collapse to a single
|
||||||
|
space so the query is a single clean line."""
|
||||||
|
message = "hello\tworld foo\n\n bar"
|
||||||
|
|
||||||
|
result = _clean_search_query(message)
|
||||||
|
|
||||||
|
assert result == "hello world foo bar"
|
||||||
|
assert " " not in result
|
||||||
|
assert "\n" not in result
|
||||||
|
assert "\t" not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_search_query_truncates_long_input():
|
||||||
|
"""Long queries are capped at ``max_len`` (default 200) to stay within search
|
||||||
|
API limits; truncation is a strict prefix of the cleaned text."""
|
||||||
|
long_message = "x" * 300
|
||||||
|
|
||||||
|
result_default = _clean_search_query(long_message)
|
||||||
|
result_custom = _clean_search_query(long_message, max_len=10)
|
||||||
|
|
||||||
|
assert len(result_default) == 200
|
||||||
|
assert result_default == "x" * 200
|
||||||
|
assert result_custom == "x" * 10
|
||||||
|
|
||||||
|
|
||||||
|
# ── Integration tests: the generated-query + sanitization flow ──
|
||||||
|
#
|
||||||
|
# These cover the combined behaviour requested in review of #4863 after #4557
|
||||||
|
# landed: the LLM-generated query is used on success, the first-line fallback is
|
||||||
|
# used when the LLM fails or returns empty, and in every case the *final* query
|
||||||
|
# handed to comprehensive_web_search() is sanitized.
|
||||||
|
|
||||||
|
# A messy user message whose first non-empty line (the #4557 fallback) is
|
||||||
|
# inline-code prose followed by a fenced block. After sanitization the fallback
|
||||||
|
# collapses to plain prose.
|
||||||
|
_MESSY = 'Is `git reset` safe?\n```python\nprint("leaked body")\n```'
|
||||||
|
_SANITIZED_FALLBACK = "Is git reset safe?"
|
||||||
|
|
||||||
|
|
||||||
|
class _Session:
|
||||||
|
"""Minimal stand-in for the session object read by the generated-query
|
||||||
|
flow (endpoint_url / model / headers)."""
|
||||||
|
|
||||||
|
endpoint_url = "http://example.local/v1"
|
||||||
|
model = "test-model"
|
||||||
|
headers = {"Authorization": "Bearer test"}
|
||||||
|
|
||||||
|
|
||||||
|
class _Memory:
|
||||||
|
def load(self, owner=None):
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
class _Docs:
|
||||||
|
rag_manager = None
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_flow(monkeypatch, llm_behaviour, captured):
|
||||||
|
"""Wire both seams of the generated-query flow: the LLM call and the
|
||||||
|
search call. ``llm_behaviour`` is either a string to return or an Exception
|
||||||
|
instance to raise."""
|
||||||
|
|
||||||
|
def _fake_search(query, *args, **kwargs):
|
||||||
|
captured["query"] = query
|
||||||
|
captured["kwargs"] = kwargs
|
||||||
|
return ("web context", [{"title": "src"}])
|
||||||
|
|
||||||
|
def _fake_llm(*args, **kwargs):
|
||||||
|
if isinstance(llm_behaviour, Exception):
|
||||||
|
raise llm_behaviour
|
||||||
|
return llm_behaviour
|
||||||
|
|
||||||
|
monkeypatch.setattr("src.chat_processor.comprehensive_web_search", _fake_search)
|
||||||
|
monkeypatch.setattr("src.llm_core.llm_call", _fake_llm)
|
||||||
|
|
||||||
|
|
||||||
|
def test_generated_query_is_used_and_sanitized(monkeypatch):
|
||||||
|
"""Requirement: on LLM success the generated query wins, and the *final*
|
||||||
|
query handed to comprehensive_web_search() is sanitized.
|
||||||
|
|
||||||
|
The fake LLM returns a query containing inline-code markdown so we can also
|
||||||
|
prove the sanitizer runs on the generated path (not just the fallback)."""
|
||||||
|
captured = {}
|
||||||
|
_patch_flow(monkeypatch, "capital of `France`", captured)
|
||||||
|
|
||||||
|
processor = ChatProcessor(memory_manager=_Memory(), personal_docs_manager=_Docs())
|
||||||
|
preface, _, _ = processor.build_context_preface(
|
||||||
|
message=_MESSY,
|
||||||
|
session=_Session(),
|
||||||
|
use_web=True,
|
||||||
|
use_memory=False,
|
||||||
|
use_rag=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "query" in captured, "comprehensive_web_search was not called"
|
||||||
|
|
||||||
|
# The generated query won (not the sanitized first-line fallback) ...
|
||||||
|
assert captured["query"] == "capital of France"
|
||||||
|
assert captured["query"] != _SANITIZED_FALLBACK
|
||||||
|
# ... and it was sanitized: no residual markdown fences/backticks.
|
||||||
|
assert "`" not in captured["query"]
|
||||||
|
assert "```" not in captured["query"]
|
||||||
|
|
||||||
|
# The other call-site kwargs (return_sources) are still forwarded.
|
||||||
|
assert captured["kwargs"].get("return_sources") is True
|
||||||
|
# And the retrieved context was still appended to the preface.
|
||||||
|
assert any("web context" in (msg.get("content") or "") for msg in preface)
|
||||||
|
|
||||||
|
|
||||||
|
def test_falls_back_to_sanitized_first_line_when_llm_raises(monkeypatch):
|
||||||
|
"""Requirement: when the LLM call raises, #4557's fallback (first non-empty
|
||||||
|
line) is used — and that fallback is sanitized before the search call."""
|
||||||
|
captured = {}
|
||||||
|
_patch_flow(monkeypatch, RuntimeError("LLM endpoint down"), captured)
|
||||||
|
|
||||||
|
processor = ChatProcessor(memory_manager=_Memory(), personal_docs_manager=_Docs())
|
||||||
|
processor.build_context_preface(
|
||||||
|
message=_MESSY,
|
||||||
|
session=_Session(),
|
||||||
|
use_web=True,
|
||||||
|
use_memory=False,
|
||||||
|
use_rag=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "query" in captured, "comprehensive_web_search was not called"
|
||||||
|
# Fallback was the first line ("Is `git reset` safe?"), sanitized.
|
||||||
|
assert captured["query"] == _SANITIZED_FALLBACK
|
||||||
|
assert "git reset" in captured["query"] # inline code preserved
|
||||||
|
assert "`" not in captured["query"] # backticks stripped
|
||||||
|
# The fenced body from later lines never reached the query.
|
||||||
|
assert "leaked body" not in captured["query"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_falls_back_to_sanitized_first_line_when_llm_returns_empty(monkeypatch):
|
||||||
|
"""Requirement: when the LLM returns an empty/whitespace-only query, #4557
|
||||||
|
falls back — and that fallback is sanitized before the search call."""
|
||||||
|
captured = {}
|
||||||
|
_patch_flow(monkeypatch, " ", captured)
|
||||||
|
|
||||||
|
processor = ChatProcessor(memory_manager=_Memory(), personal_docs_manager=_Docs())
|
||||||
|
processor.build_context_preface(
|
||||||
|
message=_MESSY,
|
||||||
|
session=_Session(),
|
||||||
|
use_web=True,
|
||||||
|
use_memory=False,
|
||||||
|
use_rag=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "query" in captured, "comprehensive_web_search was not called"
|
||||||
|
assert captured["query"] == _SANITIZED_FALLBACK
|
||||||
|
assert "git reset" in captured["query"]
|
||||||
|
assert "`" not in captured["query"]
|
||||||
Reference in New Issue
Block a user