From 6d64055328cdd8a1608470ed174116de4dd06871 Mon Sep 17 00:00:00 2001
From: Vykos <illjaesterhazy@gmail.com>
Date: Fri, 5 Jun 2026 13:20:02 +0200
Subject: [PATCH] Constrain research handler JSON paths (#2846)

---
 src/research_handler.py                       | 57 ++++++++++---
 .../test_research_handler_path_confinement.py | 83 +++++++++++++++++++
 2 files changed, 130 insertions(+), 10 deletions(-)
 create mode 100644 tests/test_research_handler_path_confinement.py

diff --git a/src/research_handler.py b/src/research_handler.py
index bec9695ec..4a721cd5e 100644
--- a/src/research_handler.py
+++ b/src/research_handler.py
@@ -20,6 +20,7 @@ from src.research_utils import strip_thinking, is_low_quality
 logger = logging.getLogger(__name__)
 
 RESEARCH_DATA_DIR = Path("data/deep_research")
+_RESEARCH_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9-]{1,128}$")
 
 
 def _bounded_int(value, *, default: int, minimum: int, maximum: int) -> int:
@@ -48,6 +49,18 @@ def _format_probe_failure(model: str, exc: Exception) -> str:
     return f"Cannot reach model '{model}' — check that the endpoint is running and accessible."
 
 
+def _research_json_path(session_id: str) -> Optional[Path]:
+    if not isinstance(session_id, str) or not _RESEARCH_SESSION_ID_RE.fullmatch(session_id):
+        return None
+    root = RESEARCH_DATA_DIR.resolve()
+    path = (RESEARCH_DATA_DIR / f"{session_id}.json").resolve()
+    try:
+        path.relative_to(root)
+    except ValueError:
+        return None
+    return path
+
+
 class ResearchHandler:
     """Handles research service operations with iterative deep research."""
 
@@ -232,6 +245,9 @@ class ResearchHandler:
         max_rounds is the safety cap; the AI's _should_stop decision (after
         min_rounds) terminates the loop earlier in normal operation.
         """
+        if _research_json_path(session_id) is None:
+            raise ValueError("Invalid research session_id")
+
         # Resolve the hard wall-clock timeout from settings when the caller
         # didn't pin one. Local / edge models routinely need more than the
         # old 600s default to finish a deep-research synthesis. A setting of
@@ -368,7 +384,9 @@ class ResearchHandler:
                 result["avg_duration"] = round(avg, 1)
             return result
         # Check disk for completed research (skip consumed results)
-        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        path = _research_json_path(session_id)
+        if path is None:
+            return None
         if path.exists():
             try:
                 data = json.loads(path.read_text(encoding="utf-8"))
@@ -407,7 +425,9 @@ class ResearchHandler:
             if entry["status"] in ("done", "error", "cancelled"):
                 return entry.get("result")
         # Check disk (skip consumed results)
-        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        path = _research_json_path(session_id)
+        if path is None:
+            return None
         if path.exists():
             try:
                 data = json.loads(path.read_text(encoding="utf-8"))
@@ -429,7 +449,9 @@ class ResearchHandler:
             if researcher and researcher.findings:
                 return self._extract_sources(researcher.findings)
         # Check disk
-        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        path = _research_json_path(session_id)
+        if path is None:
+            return None
         if path.exists():
             try:
                 data = json.loads(path.read_text(encoding="utf-8"))
@@ -446,7 +468,9 @@ class ResearchHandler:
             if researcher and researcher.findings:
                 return self._extract_raw_findings(researcher.findings)
         # Check disk
-        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        path = _research_json_path(session_id)
+        if path is None:
+            return None
         if path.exists():
             try:
                 data = json.loads(path.read_text(encoding="utf-8"))
@@ -521,7 +545,9 @@ class ResearchHandler:
         Keeps the JSON on disk so visual reports can be generated later.
         """
         self._active_tasks.pop(session_id, None)
-        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        path = _research_json_path(session_id)
+        if path is None:
+            return
         if path.exists():
             try:
                 data = json.loads(path.read_text(encoding="utf-8"))
@@ -533,6 +559,10 @@ class ResearchHandler:
     def _save_result(self, session_id: str, entry: dict):
         """Persist completed research result to disk."""
         try:
+            path = _research_json_path(session_id)
+            if path is None:
+                logger.error("Refusing to save research result for invalid session_id: %r", session_id)
+                return
             # Extract and cache sources + raw findings
             sources = []
             raw_findings = []
@@ -542,7 +572,6 @@ class ResearchHandler:
                 raw_findings = self._extract_raw_findings(researcher.findings)
             entry["sources"] = sources
 
-            path = RESEARCH_DATA_DIR / f"{session_id}.json"
             data = {
                 "query": entry["query"],
                 "status": entry["status"],
@@ -569,7 +598,9 @@ class ResearchHandler:
 
     def _get_session_json(self, session_id: str) -> Optional[dict]:
         """Load the saved research JSON for a session, if it exists."""
-        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        path = _research_json_path(session_id)
+        if path is None:
+            return None
         if path.exists():
             try:
                 return json.loads(path.read_text(encoding="utf-8"))
@@ -579,7 +610,9 @@ class ResearchHandler:
 
     def get_report_html(self, session_id: str) -> Optional[str]:
         """Generate the visual HTML report for a session (always fresh from JSON)."""
-        json_path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        json_path = _research_json_path(session_id)
+        if json_path is None:
+            return None
         if not json_path.exists():
             logger.warning(f"No JSON found for visual report: {json_path}")
             return None
@@ -606,7 +639,9 @@ class ResearchHandler:
 
     def hide_image(self, session_id: str, image_url: str) -> bool:
         """Add image_url to the persisted hidden_images list for a research."""
-        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        path = _research_json_path(session_id)
+        if path is None:
+            return False
         if not path.exists():
             return False
         try:
@@ -624,7 +659,9 @@ class ResearchHandler:
 
     def unhide_all_images(self, session_id: str) -> bool:
         """Clear the hidden_images list for a research."""
-        path = RESEARCH_DATA_DIR / f"{session_id}.json"
+        path = _research_json_path(session_id)
+        if path is None:
+            return False
         if not path.exists():
             return False
         try:
diff --git a/tests/test_research_handler_path_confinement.py b/tests/test_research_handler_path_confinement.py
new file mode 100644
index 000000000..5682a522e
--- /dev/null
+++ b/tests/test_research_handler_path_confinement.py
@@ -0,0 +1,83 @@
+import json
+
+import pytest
+
+from src import research_handler
+from src.research_handler import ResearchHandler
+
+
+def _handler():
+    handler = ResearchHandler.__new__(ResearchHandler)
+    handler._active_tasks = {}
+    return handler
+
+
+def test_research_json_path_allows_safe_ids(tmp_path, monkeypatch):
+    data_dir = tmp_path / "deep_research"
+    monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", data_dir)
+
+    path = research_handler._research_json_path("rp-abc123")
+
+    assert path == (data_dir / "rp-abc123.json").resolve()
+
+
+@pytest.mark.parametrize("session_id", ["../escape", "..", "rp/test", "rp_test", "", None])
+def test_research_json_path_rejects_invalid_ids(tmp_path, monkeypatch, session_id):
+    monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", tmp_path / "deep_research")
+
+    assert research_handler._research_json_path(session_id) is None
+
+
+def test_research_json_path_rejects_symlink_escape(tmp_path, monkeypatch):
+    data_dir = tmp_path / "deep_research"
+    outside = tmp_path / "outside"
+    data_dir.mkdir()
+    outside.mkdir()
+    monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", data_dir)
+    link = data_dir / "rp-abc123.json"
+    target = outside / "rp-abc123.json"
+    target.write_text("{}", encoding="utf-8")
+    try:
+        link.symlink_to(target)
+    except (AttributeError, NotImplementedError, OSError) as exc:
+        pytest.skip(f"symlinks unavailable: {exc}")
+
+    assert research_handler._research_json_path("rp-abc123") is None
+
+
+def test_handler_disk_read_methods_reject_invalid_ids(tmp_path, monkeypatch):
+    outside = tmp_path / "escape.json"
+    outside.write_text(json.dumps({"result": "secret"}), encoding="utf-8")
+    monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", tmp_path / "deep_research")
+    handler = _handler()
+
+    assert handler.get_status("../escape") is None
+    assert handler.get_result("../escape") is None
+    assert handler.get_sources("../escape") is None
+    assert handler.get_raw_findings("../escape") is None
+    assert handler._get_session_json("../escape") is None
+    assert handler.get_report_html("../escape") is None
+
+
+def test_handler_mutations_reject_invalid_ids_without_touching_outside_files(tmp_path, monkeypatch):
+    outside = tmp_path / "escape.json"
+    outside.write_text(json.dumps({"result": "secret", "hidden_images": ["x"]}), encoding="utf-8")
+    monkeypatch.setattr(research_handler, "RESEARCH_DATA_DIR", tmp_path / "deep_research")
+    handler = _handler()
+
+    assert handler.hide_image("../escape", "https://example.com/image.png") is False
+    assert handler.unhide_all_images("../escape") is False
+    handler.clear_result("../escape")
+    handler._save_result("../escape", {"query": "q", "status": "done", "result": "r", "started_at": 1})
+
+    assert json.loads(outside.read_text(encoding="utf-8")) == {
+        "result": "secret",
+        "hidden_images": ["x"],
+    }
+
+
+def test_start_research_rejects_invalid_session_id():
+    handler = _handler()
+
+    with pytest.raises(ValueError):
+        handler.start_research("../escape", "q", "http://localhost", "model")