From 5b8bfdabab92355523742d5dcd05b4fca048e119 Mon Sep 17 00:00:00 2001 From: Rudra Sarker <78224940+rudra496@users.noreply.github.com> Date: Sun, 28 Jun 2026 06:23:08 +0600 Subject: [PATCH] fix(chat): sanitize web search query to strip markdown and code blocks (#4863) Layer a defensive cleanup on top of the generated-query web-search flow so the final selected query is sanitized before reaching comprehensive_web_search. - remove fenced code blocks from the final search query - preserve inline code as plain text - collapse whitespace and cap query length - cover generated-query success plus LLM failure/empty fallback paths Partially addresses #4547. --- src/chat_processor.py | 46 +++++ tests/test_web_search_query_sanitization.py | 209 ++++++++++++++++++++ 2 files changed, 255 insertions(+) create mode 100644 tests/test_web_search_query_sanitization.py diff --git a/src/chat_processor.py b/src/chat_processor.py index 4fddae0b7..f23740130 100644 --- a/src/chat_processor.py +++ b/src/chat_processor.py @@ -12,6 +12,45 @@ from src.prompt_security import UNTRUSTED_CONTEXT_POLICY, untrusted_context_mess logger = logging.getLogger(__name__) + +def _clean_search_query(query: str, max_len: int = 200) -> str: + """Strip fenced code blocks from a search query while preserving inline + code text. + + This is a focused, defensive cleanup for the *final* web-search query + selected in ``build_context_preface`` (issue #4547): regardless of whether + the query came from the LLM-generated path (#4557) or the first-line + fallback, residual fenced / inline markdown should not leak into the search + call. Rather than using regex (which is brittle and strips inline code + text like ``git reset`` from the query), we render the query to HTML via + ``markdown`` and parse it with ``BeautifulSoup`` so that: + + * ``
`` blocks (fenced / indented code) are removed entirely.
+    * ```` elements (inline code) are preserved as plain text.
+
+    Both libraries are already project dependencies. The result is whitespace
+    collapsed and truncated to ``max_len``; an all-code input collapses to an
+    empty string, which the caller treats as "no query".
+    """
+    import markdown as _md
+    from bs4 import BeautifulSoup as _BS
+
+    html = _md.markdown(query, extensions=["fenced_code"])
+    soup = _BS(html, "html.parser")
+
+    # Remove fenced / indented code blocks.
+    for pre in soup.find_all("pre"):
+        pre.decompose()
+
+    # Preserve inline code by unwrapping  to text.
+    for code in soup.find_all("code"):
+        code.replace_with(code.get_text())
+
+    text = soup.get_text(" ", strip=True)
+    text = re.sub(r"\s+", " ", text)
+    return text[:max_len]
+
+
 # ── Stopwords & tokenizer ──
 
 _STOPWORDS = frozenset(
@@ -322,6 +361,13 @@ class ChatProcessor:
                 if len(search_query) > 150:
                     search_query = search_query[:150].strip()
 
+                # Defensive cleanup of the final selected query (interim fix
+                # for #4547): strip any residual fenced/inline markdown so that
+                # neither the generated query nor the first-line fallback leaks
+                # fences or backticks into the search call. No-op on clean
+                # generated queries; collapses to "" when the query is all code.
+                search_query = _clean_search_query(search_query, max_len=150)
+
                 if search_query:
                     # Execute web search using the final selected query
                     web_context, web_sources = comprehensive_web_search(
diff --git a/tests/test_web_search_query_sanitization.py b/tests/test_web_search_query_sanitization.py
new file mode 100644
index 000000000..58e338233
--- /dev/null
+++ b/tests/test_web_search_query_sanitization.py
@@ -0,0 +1,209 @@
+"""Regression tests for #4547 — chat-mode web search query sanitization.
+
+Chat-mode web search (``use_web``) selects a search query via the
+generated-query flow added in #4557: an LLM extracts a concise query, falling
+back to the first non-empty line of the user message when the LLM fails or
+returns an empty result. PR #4863 layers a focused, *defensive* cleanup on top
+of that flow: whatever query is finally selected (generated or fallback) is
+passed through ``_clean_search_query()`` before reaching
+``comprehensive_web_search()``, so residual fenced/inline markdown never leaks
+into the search call.
+
+``_clean_search_query()`` renders the query to HTML via ``markdown``
+(``fenced_code`` extension), drops ``
`` blocks entirely, unwraps inline
+```` to its text (so ``git reset`` survives), collapses whitespace, and
+truncates.
+
+The first four tests pin the helper directly; the last three prove it is
+wired into the production path and that the combined generated-query +
+sanitization behaviour holds for all three selection outcomes (generated
+success, LLM exception, empty LLM result).
+
+This is intentionally a narrow interim/defensive fix for #4547; it does not
+replace the generated-query flow from #4557.
+"""
+from src.chat_processor import ChatProcessor, _clean_search_query
+
+
+# ── Unit tests: _clean_search_query ──
+
+
+def test_clean_search_query_removes_fenced_code_blocks():
+    """A fenced code block must be dropped entirely, including the code body
+    and the fences — only the surrounding prose survives."""
+    message = '```python\nprint("hello")\n```\nWhat is the capital of France?'
+
+    result = _clean_search_query(message)
+
+    assert result == "What is the capital of France?"
+    # Guards against the original leak: no fences, no code body.
+    assert "```" not in result
+    assert "print" not in result
+
+
+def test_clean_search_query_preserves_inline_code():
+    """Inline code text is search-relevant and must survive unwrapped; only the
+    backticks are removed. This is the ``git reset`` case the reviewer flagged
+    against the earlier regex approach (which dropped the word entirely)."""
+    message = "Is it a good idea to use `git reset` to undo my changes?"
+
+    result = _clean_search_query(message)
+
+    assert result == "Is it a good idea to use git reset to undo my changes?"
+    assert "git reset" in result
+    assert "`" not in result
+
+
+def test_clean_search_query_collapses_whitespace():
+    """Runs of whitespace (tabs, multiple spaces, newlines) collapse to a single
+    space so the query is a single clean line."""
+    message = "hello\tworld   foo\n\n   bar"
+
+    result = _clean_search_query(message)
+
+    assert result == "hello world foo bar"
+    assert "  " not in result
+    assert "\n" not in result
+    assert "\t" not in result
+
+
+def test_clean_search_query_truncates_long_input():
+    """Long queries are capped at ``max_len`` (default 200) to stay within search
+    API limits; truncation is a strict prefix of the cleaned text."""
+    long_message = "x" * 300
+
+    result_default = _clean_search_query(long_message)
+    result_custom = _clean_search_query(long_message, max_len=10)
+
+    assert len(result_default) == 200
+    assert result_default == "x" * 200
+    assert result_custom == "x" * 10
+
+
+# ── Integration tests: the generated-query + sanitization flow ──
+#
+# These cover the combined behaviour requested in review of #4863 after #4557
+# landed: the LLM-generated query is used on success, the first-line fallback is
+# used when the LLM fails or returns empty, and in every case the *final* query
+# handed to comprehensive_web_search() is sanitized.
+
+# A messy user message whose first non-empty line (the #4557 fallback) is
+# inline-code prose followed by a fenced block. After sanitization the fallback
+# collapses to plain prose.
+_MESSY = 'Is `git reset` safe?\n```python\nprint("leaked body")\n```'
+_SANITIZED_FALLBACK = "Is git reset safe?"
+
+
+class _Session:
+    """Minimal stand-in for the session object read by the generated-query
+    flow (endpoint_url / model / headers)."""
+
+    endpoint_url = "http://example.local/v1"
+    model = "test-model"
+    headers = {"Authorization": "Bearer test"}
+
+
+class _Memory:
+    def load(self, owner=None):
+        return []
+
+
+class _Docs:
+    rag_manager = None
+
+
+def _patch_flow(monkeypatch, llm_behaviour, captured):
+    """Wire both seams of the generated-query flow: the LLM call and the
+    search call. ``llm_behaviour`` is either a string to return or an Exception
+    instance to raise."""
+
+    def _fake_search(query, *args, **kwargs):
+        captured["query"] = query
+        captured["kwargs"] = kwargs
+        return ("web context", [{"title": "src"}])
+
+    def _fake_llm(*args, **kwargs):
+        if isinstance(llm_behaviour, Exception):
+            raise llm_behaviour
+        return llm_behaviour
+
+    monkeypatch.setattr("src.chat_processor.comprehensive_web_search", _fake_search)
+    monkeypatch.setattr("src.llm_core.llm_call", _fake_llm)
+
+
+def test_generated_query_is_used_and_sanitized(monkeypatch):
+    """Requirement: on LLM success the generated query wins, and the *final*
+    query handed to comprehensive_web_search() is sanitized.
+
+    The fake LLM returns a query containing inline-code markdown so we can also
+    prove the sanitizer runs on the generated path (not just the fallback)."""
+    captured = {}
+    _patch_flow(monkeypatch, "capital of `France`", captured)
+
+    processor = ChatProcessor(memory_manager=_Memory(), personal_docs_manager=_Docs())
+    preface, _, _ = processor.build_context_preface(
+        message=_MESSY,
+        session=_Session(),
+        use_web=True,
+        use_memory=False,
+        use_rag=False,
+    )
+
+    assert "query" in captured, "comprehensive_web_search was not called"
+
+    # The generated query won (not the sanitized first-line fallback) ...
+    assert captured["query"] == "capital of France"
+    assert captured["query"] != _SANITIZED_FALLBACK
+    # ... and it was sanitized: no residual markdown fences/backticks.
+    assert "`" not in captured["query"]
+    assert "```" not in captured["query"]
+
+    # The other call-site kwargs (return_sources) are still forwarded.
+    assert captured["kwargs"].get("return_sources") is True
+    # And the retrieved context was still appended to the preface.
+    assert any("web context" in (msg.get("content") or "") for msg in preface)
+
+
+def test_falls_back_to_sanitized_first_line_when_llm_raises(monkeypatch):
+    """Requirement: when the LLM call raises, #4557's fallback (first non-empty
+    line) is used — and that fallback is sanitized before the search call."""
+    captured = {}
+    _patch_flow(monkeypatch, RuntimeError("LLM endpoint down"), captured)
+
+    processor = ChatProcessor(memory_manager=_Memory(), personal_docs_manager=_Docs())
+    processor.build_context_preface(
+        message=_MESSY,
+        session=_Session(),
+        use_web=True,
+        use_memory=False,
+        use_rag=False,
+    )
+
+    assert "query" in captured, "comprehensive_web_search was not called"
+    # Fallback was the first line ("Is `git reset` safe?"), sanitized.
+    assert captured["query"] == _SANITIZED_FALLBACK
+    assert "git reset" in captured["query"]  # inline code preserved
+    assert "`" not in captured["query"]  # backticks stripped
+    # The fenced body from later lines never reached the query.
+    assert "leaked body" not in captured["query"]
+
+
+def test_falls_back_to_sanitized_first_line_when_llm_returns_empty(monkeypatch):
+    """Requirement: when the LLM returns an empty/whitespace-only query, #4557
+    falls back — and that fallback is sanitized before the search call."""
+    captured = {}
+    _patch_flow(monkeypatch, "   ", captured)
+
+    processor = ChatProcessor(memory_manager=_Memory(), personal_docs_manager=_Docs())
+    processor.build_context_preface(
+        message=_MESSY,
+        session=_Session(),
+        use_web=True,
+        use_memory=False,
+        use_rag=False,
+    )
+
+    assert "query" in captured, "comprehensive_web_search was not called"
+    assert captured["query"] == _SANITIZED_FALLBACK
+    assert "git reset" in captured["query"]
+    assert "`" not in captured["query"]