mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
fix(deep-research): wrap fetched webpage content in untrusted-context sandbox
The goal-based extractor passed raw fetched webpage content straight into the LLM prompt via string substitution, bypassing the prompt-injection hardening layer in src/prompt_security.py.

Split EXTRACTOR_PROMPT into EXTRACTOR_SYSTEM (task instructions + goal, trusted) and a second message built with untrusted_context_message() (raw page content, sandboxed with <<<UNTRUSTED_SOURCE_DATA>>> guards). This aligns the extractor with every other external-content injection site in the codebase (agent_loop, chat_processor, chat_routes).

Fixes #3044

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,7 +16,8 @@ from typing import Callable, Dict, List, Optional, Set
|
|||||||
|
|
||||||
from src.research_utils import strip_thinking, is_low_quality
|
from src.research_utils import strip_thinking, is_low_quality
|
||||||
|
|
||||||
from src.goal_based_extractor import EXTRACTOR_PROMPT
|
from src.goal_based_extractor import EXTRACTOR_SYSTEM
|
||||||
|
from src.prompt_security import untrusted_context_message
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -625,11 +626,12 @@ class DeepResearcher:
|
|||||||
else:
|
else:
|
||||||
content = truncated
|
content = truncated
|
||||||
|
|
||||||
prompt = EXTRACTOR_PROMPT.format(webpage_content=content, goal=question)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = await self._llm(
|
response = await self._llm(
|
||||||
[{"role": "user", "content": prompt}],
|
[
|
||||||
|
{"role": "user", "content": EXTRACTOR_SYSTEM.format(goal=question)},
|
||||||
|
untrusted_context_message("webpage", content),
|
||||||
|
],
|
||||||
temperature=0.2,
|
temperature=0.2,
|
||||||
max_tokens=2048,
|
max_tokens=2048,
|
||||||
timeout=self.extraction_timeout,
|
timeout=self.extraction_timeout,
|
||||||
|
|||||||
@@ -3,22 +3,18 @@
|
|||||||
Goal-based content extraction prompt inspired by Alibaba Tongyi DeepResearch.
|
Goal-based content extraction prompt inspired by Alibaba Tongyi DeepResearch.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
EXTRACTOR_PROMPT = """Please process the following webpage content and user goal to extract relevant information:
|
EXTRACTOR_SYSTEM = """Extract relevant information from a webpage for a given research goal.
|
||||||
|
|
||||||
## **Webpage Content**
|
Goal: {goal}
|
||||||
{webpage_content}
|
|
||||||
|
|
||||||
## **User Goal**
|
Task guidelines:
|
||||||
{goal}
|
1. Locate the specific sections directly related to the goal within the provided webpage content.
|
||||||
|
2. Identify and extract the most relevant information; output full original context where possible, up to three or more paragraphs.
|
||||||
|
3. Organize into a concise paragraph with logical flow, judging each piece of information's contribution to the goal.
|
||||||
|
|
||||||
## **Task Guidelines**
|
Respond in JSON with exactly these fields: "rational", "evidence", "summary".
|
||||||
1. **Content Scanning for Rational**: Locate the **specific sections/data** directly related to the user's goal within the webpage content
|
|
||||||
2. **Key Extraction for Evidence**: Identify and extract the **most relevant information** from the content, you never miss any important information, output the **full original context** of the content as far as possible, it can be more than three paragraphs.
|
|
||||||
3. **Summary Output for Summary**: Organize into a concise paragraph with logical flow, prioritizing clarity and judge the contribution of the information to the goal.
|
|
||||||
|
|
||||||
**Final Output Format using JSON format has "rational", "evidence", "summary" fields**
|
Example:
|
||||||
|
|
||||||
Example output:
|
|
||||||
{{
|
{{
|
||||||
"rational": "This section discusses X which directly relates to the goal of understanding Y",
|
"rational": "This section discusses X which directly relates to the goal of understanding Y",
|
||||||
"evidence": "Full quotes and context from the page...",
|
"evidence": "Full quotes and context from the page...",
|
||||||
|
|||||||
Reference in New Issue
Block a user