fix(deep-research): wrap fetched webpage content in untrusted-context sandbox

The goal-based extractor interpolated raw fetched webpage content directly into the LLM prompt via string substitution, bypassing the prompt-injection hardening layer in src/prompt_security.py.

Split EXTRACTOR_PROMPT into EXTRACTOR_SYSTEM (task instructions + goal, trusted) and a second message built with untrusted_context_message() (raw page content, sandboxed with <<<UNTRUSTED_SOURCE_DATA>>> guards). This aligns the extractor with every other external-content injection site in the codebase (agent_loop, chat_processor, chat_routes).

Fixes #3044

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-19 19:25:27 -04:00 · 2026-06-06 11:37:10 +02:00
parent 893cb8254f
commit e87a1ad8d2
2 changed files with 14 additions and 16 deletions
@@ -16,7 +16,8 @@ from typing import Callable, Dict, List, Optional, Set

 from src.research_utils import strip_thinking, is_low_quality

-from src.goal_based_extractor import EXTRACTOR_PROMPT
+from src.goal_based_extractor import EXTRACTOR_SYSTEM
+from src.prompt_security import untrusted_context_message

 logger = logging.getLogger(__name__)

@@ -625,11 +626,12 @@ class DeepResearcher:
            else:
                content = truncated

-        prompt = EXTRACTOR_PROMPT.format(webpage_content=content, goal=question)
-
        try:
            response = await self._llm(
-                [{"role": "user", "content": prompt}],
+                [
+                    {"role": "user", "content": EXTRACTOR_SYSTEM.format(goal=question)},
+                    untrusted_context_message("webpage", content),
+                ],
                temperature=0.2,
                max_tokens=2048,
                timeout=self.extraction_timeout,