fix(chat): sanitize web search query to strip markdown and code blocks (#4863)

Layer a defensive cleanup on top of the generated-query web-search flow so the final selected query is sanitized before reaching comprehensive_web_search.

- remove fenced code blocks from the final search query
- preserve inline code as plain text
- collapse whitespace and cap query length
- cover generated-query success plus LLM failure/empty fallback paths

Partially addresses #4547.
This commit is contained in:
Rudra Sarker
2026-06-28 06:23:08 +06:00
committed by GitHub
parent ff0f1b3450
commit 5b8bfdabab
2 changed files with 255 additions and 0 deletions
+46
View File
@@ -12,6 +12,45 @@ from src.prompt_security import UNTRUSTED_CONTEXT_POLICY, untrusted_context_mess
logger = logging.getLogger(__name__)
def _clean_search_query(query: str, max_len: int = 200) -> str:
"""Strip fenced code blocks from a search query while preserving inline
code text.
This is a focused, defensive cleanup for the *final* web-search query
selected in ``build_context_preface`` (issue #4547): regardless of whether
the query came from the LLM-generated path (#4557) or the first-line
fallback, residual fenced / inline markdown should not leak into the search
call. Rather than using regex (which is brittle and strips inline code
text like ``git reset`` from the query), we render the query to HTML via
``markdown`` and parse it with ``BeautifulSoup`` so that:
* ``<pre>`` blocks (fenced / indented code) are removed entirely.
* ``<code>`` elements (inline code) are preserved as plain text.
Both libraries are already project dependencies. The result is whitespace
collapsed and truncated to ``max_len``; an all-code input collapses to an
empty string, which the caller treats as "no query".
"""
import markdown as _md
from bs4 import BeautifulSoup as _BS
html = _md.markdown(query, extensions=["fenced_code"])
soup = _BS(html, "html.parser")
# Remove fenced / indented code blocks.
for pre in soup.find_all("pre"):
pre.decompose()
# Preserve inline code by unwrapping <code> to text.
for code in soup.find_all("code"):
code.replace_with(code.get_text())
text = soup.get_text(" ", strip=True)
text = re.sub(r"\s+", " ", text)
return text[:max_len]
# ── Stopwords & tokenizer ──
_STOPWORDS = frozenset(
@@ -322,6 +361,13 @@ class ChatProcessor:
if len(search_query) > 150:
search_query = search_query[:150].strip()
# Defensive cleanup of the final selected query (interim fix
# for #4547): strip any residual fenced/inline markdown so that
# neither the generated query nor the first-line fallback leaks
# fences or backticks into the search call. No-op on clean
# generated queries; collapses to "" when the query is all code.
search_query = _clean_search_query(search_query, max_len=150)
if search_query:
# Execute web search using the final selected query
web_context, web_sources = comprehensive_web_search(