From 6d64055328cdd8a1608470ed174116de4dd06871 Mon Sep 17 00:00:00 2001 From: Vykos Date: Fri, 5 Jun 2026 13:20:02 +0200 Subject: [PATCH] Constrain research handler JSON paths (#2846) --- src/research_handler.py | 57 ++++++++++--- .../test_research_handler_path_confinement.py | 83 +++++++++++++++++++ 2 files changed, 130 insertions(+), 10 deletions(-) create mode 100644 tests/test_research_handler_path_confinement.py diff --git a/src/research_handler.py b/src/research_handler.py index bec9695ec..4a721cd5e 100644 --- a/src/research_handler.py +++ b/src/research_handler.py @@ -20,6 +20,7 @@ from src.research_utils import strip_thinking, is_low_quality logger = logging.getLogger(__name__) RESEARCH_DATA_DIR = Path("data/deep_research") +_RESEARCH_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9-]{1,128}$") def _bounded_int(value, *, default: int, minimum: int, maximum: int) -> int: @@ -48,6 +49,18 @@ def _format_probe_failure(model: str, exc: Exception) -> str: return f"Cannot reach model '{model}' — check that the endpoint is running and accessible." +def _research_json_path(session_id: str) -> Optional[Path]: + if not isinstance(session_id, str) or not _RESEARCH_SESSION_ID_RE.fullmatch(session_id): + return None + root = RESEARCH_DATA_DIR.resolve() + path = (RESEARCH_DATA_DIR / f"{session_id}.json").resolve() + try: + path.relative_to(root) + except ValueError: + return None + return path + + class ResearchHandler: """Handles research service operations with iterative deep research.""" @@ -232,6 +245,9 @@ class ResearchHandler: max_rounds is the safety cap; the AI's _should_stop decision (after min_rounds) terminates the loop earlier in normal operation. """ + if _research_json_path(session_id) is None: + raise ValueError("Invalid research session_id") + # Resolve the hard wall-clock timeout from settings when the caller # didn't pin one. Local / edge models routinely need more than the # old 600s default to finish a deep-research synthesis. A setting of @@ -368,7 +384,9 @@ class ResearchHandler: result["avg_duration"] = round(avg, 1) return result # Check disk for completed research (skip consumed results) - path = RESEARCH_DATA_DIR / f"{session_id}.json" + path = _research_json_path(session_id) + if path is None: + return None if path.exists(): try: data = json.loads(path.read_text(encoding="utf-8")) @@ -407,7 +425,9 @@ class ResearchHandler: if entry["status"] in ("done", "error", "cancelled"): return entry.get("result") # Check disk (skip consumed results) - path = RESEARCH_DATA_DIR / f"{session_id}.json" + path = _research_json_path(session_id) + if path is None: + return None if path.exists(): try: data = json.loads(path.read_text(encoding="utf-8")) @@ -429,7 +449,9 @@ class ResearchHandler: if researcher and researcher.findings: return self._extract_sources(researcher.findings) # Check disk - path = RESEARCH_DATA_DIR / f"{session_id}.json" + path = _research_json_path(session_id) + if path is None: + return None if path.exists(): try: data = json.loads(path.read_text(encoding="utf-8")) @@ -446,7 +468,9 @@ class ResearchHandler: if researcher and researcher.findings: return self._extract_raw_findings(researcher.findings) # Check disk - path = RESEARCH_DATA_DIR / f"{session_id}.json" + path = _research_json_path(session_id) + if path is None: + return None if path.exists(): try: data = json.loads(path.read_text(encoding="utf-8")) @@ -521,7 +545,9 @@ class ResearchHandler: Keeps the JSON on disk so visual reports can be generated later. """ self._active_tasks.pop(session_id, None) - path = RESEARCH_DATA_DIR / f"{session_id}.json" + path = _research_json_path(session_id) + if path is None: + return if path.exists(): try: data = json.loads(path.read_text(encoding="utf-8")) @@ -533,6 +559,10 @@ class ResearchHandler: def _save_result(self, session_id: str, entry: dict): """Persist completed research result to disk.""" try: + path = _research_json_path(session_id) + if path is None: + logger.error("Refusing to save research result for invalid session_id: %r", session_id) + return # Extract and cache sources + raw findings sources = [] raw_findings = [] @@ -542,7 +572,6 @@ class ResearchHandler: raw_findings = self._extract_raw_findings(researcher.findings) entry["sources"] = sources - path = RESEARCH_DATA_DIR / f"{session_id}.json" data = { "query": entry["query"], "status": entry["status"], @@ -569,7 +598,9 @@ class ResearchHandler: def _get_session_json(self, session_id: str) -> Optional[dict]: """Load the saved research JSON for a session, if it exists.""" - path = RESEARCH_DATA_DIR / f"{session_id}.json" + path = _research_json_path(session_id) + if path is None: + return None if path.exists(): try: return json.loads(path.read_text(encoding="utf-8")) @@ -579,7 +610,9 @@ class ResearchHandler: def get_report_html(self, session_id: str) -> Optional[str]: """Generate the visual HTML report for a session (always fresh from JSON).""" - json_path = RESEARCH_DATA_DIR / f"{session_id}.json" + json_path = _research_json_path(session_id) + if json_path is None: + return None if not json_path.exists(): logger.warning(f"No JSON found for visual report: {json_path}") return None @@ -606,7 +639,9 @@ class ResearchHandler: def hide_image(self, session_id: str, image_url: str) -> bool: """Add image_url to the persisted hidden_images list for a research.""" - path = RESEARCH_DATA_DIR / f"{session_id}.json" + path = _research_json_path(session_id) + if path is None: + return False if not path.exists(): return False try: @@ -624,7 +659,9 @@ class ResearchHandler: def unhide_all_images(self, session_id: str) -> bool: """Clear the hidden_images list for a research.""" - path = RESEARCH_DATA_DIR / f"{session_id}.json" + path = _research_json_path(session_id) + if path is None: + return False if not path.exists(): return False try: diff --git a/tests/test_research_handler_path_confinement.py b/tests/test_research_handler_path_confinement.py new file mode 100644 index 000000000..5682a522e --- /dev/null +++ b/tests/test_research_handler_path_confinement.py @@ -0,0 +1,83 @@ +import json + +import pytest + +from src import research_handler +from src.research_handler import ResearchHandler + + +def _handler(): + handler = ResearchHandler.__new__(ResearchHandler) + handler._active_tasks = {} + return handler + + +def test_research_json_path_allows_safe_ids(tmp_path, monkeypatch): + data_dir = tmp_path / "deep_research" + monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", data_dir) + + path = research_handler._research_json_path("rp-abc123") + + assert path == (data_dir / "rp-abc123.json").resolve() + + +@pytest.mark.parametrize("session_id", ["../escape", "..", "rp/test", "rp_test", "", None]) +def test_research_json_path_rejects_invalid_ids(tmp_path, monkeypatch, session_id): + monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", tmp_path / "deep_research") + + assert research_handler._research_json_path(session_id) is None + + +def test_research_json_path_rejects_symlink_escape(tmp_path, monkeypatch): + data_dir = tmp_path / "deep_research" + outside = tmp_path / "outside" + data_dir.mkdir() + outside.mkdir() + monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", data_dir) + link = data_dir / "rp-abc123.json" + target = outside / "rp-abc123.json" + target.write_text("{}", encoding="utf-8") + try: + link.symlink_to(target) + except (AttributeError, NotImplementedError, OSError) as exc: + pytest.skip(f"symlinks unavailable: {exc}") + + assert research_handler._research_json_path("rp-abc123") is None + + +def test_handler_disk_read_methods_reject_invalid_ids(tmp_path, monkeypatch): + outside = tmp_path / "escape.json" + outside.write_text(json.dumps({"result": "secret"}), encoding="utf-8") + monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", tmp_path / "deep_research") + handler = _handler() + + assert handler.get_status("../escape") is None + assert handler.get_result("../escape") is None + assert handler.get_sources("../escape") is None + assert handler.get_raw_findings("../escape") is None + assert handler._get_session_json("../escape") is None + assert handler.get_report_html("../escape") is None + + +def test_handler_mutations_reject_invalid_ids_without_touching_outside_files(tmp_path, monkeypatch): + outside = tmp_path / "escape.json" + outside.write_text(json.dumps({"result": "secret", "hidden_images": ["x"]}), encoding="utf-8") + monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", tmp_path / "deep_research") + handler = _handler() + + assert handler.hide_image("../escape", "https://example.com/image.png") is False + assert handler.unhide_all_images("../escape") is False + handler.clear_result("../escape") + handler._save_result("../escape", {"query": "q", "status": "done", "result": "r", "started_at": 1}) + + assert json.loads(outside.read_text(encoding="utf-8")) == { + "result": "secret", + "hidden_images": ["x"], + } + + +def test_start_research_rejects_invalid_session_id(): + handler = _handler() + + with pytest.raises(ValueError): + handler.start_research("../escape", "q", "http://localhost", "model")