diff --git a/routes/email_routes.py b/routes/email_routes.py index 1a45eaf00..77be0cdeb 100644 --- a/routes/email_routes.py +++ b/routes/email_routes.py @@ -1645,13 +1645,12 @@ def setup_email_routes(): return {"error": f"Attachment index {index} not found"} from pathlib import Path as _Path - target_root = _Path(target_dir).resolve() - filepath = _Path(filepath).resolve() - try: - filepath.relative_to(target_root) - except ValueError: + target_root = os.path.abspath(str(target_dir)) + filepath_str = os.path.abspath(str(filepath)) + if os.path.commonpath([target_root, filepath_str]) != target_root: logger.warning("Rejected attachment path outside extraction dir: %s", filepath) return {"error": "Invalid attachment path"} + filepath = _Path(filepath_str) base = _Path(filepath).name if base.startswith("."): return {"error": "Invalid filename", "filename": base} @@ -1728,8 +1727,7 @@ def setup_email_routes(): _tag_doc_with_source(doc_id) return doc_id - def _attached_email_markdown(path): - raw_bytes = path.read_bytes() + def _attached_email_markdown(raw_bytes: bytes): if not raw_bytes: return f"# Attached email: {base}\n\n_(empty email attachment)_" try: @@ -1814,9 +1812,32 @@ def setup_email_routes(): # ── Attached email (.eml / message/rfc822) ──────────────── if ext == ".eml": + def _attachment_bytes_from_msg(): + if not msg.is_multipart(): + return b"" + idx = 0 + for part in msg.walk(): + cd = str(part.get("Content-Disposition", "")) + ct = part.get_content_type() + is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename()) + if part.is_multipart() and not is_attached_email: + continue + if ct in ("text/plain", "text/html") and "attachment" not in cd: + continue + if idx == index: + payload = part.get_payload(decode=True) + if payload is None and ct == "message/rfc822": + try: + payload = part.as_bytes() + except Exception: + payload = b"" + return payload or b"" + idx += 1 + return b"" + try: - content = _attached_email_markdown(filepath) - except Exception as e: + content = _attached_email_markdown(_attachment_bytes_from_msg()) + except Exception: logger.exception("Failed to read email attachment %s", base) return {"error": "Failed to read email attachment", "filename": base} doc_id = _create_markdown_doc(content, "Imported attached email") diff --git a/src/tool_parsing.py b/src/tool_parsing.py index d140ee43b..4b19d9236 100644 --- a/src/tool_parsing.py +++ b/src/tool_parsing.py @@ -545,6 +545,28 @@ def _strip_stepfun_tool_markup(text: str) -> str: return cleaned.replace(_STEPFUN_CALLS_BEGIN, "").replace(_STEPFUN_CALLS_END, "") +def _strip_bare_invoke_markup(text: str) -> str: + """Remove bare ... blocks without regex backtracking.""" + out = [] + pos = 0 + while True: + start = text.lower().find("", start) + if tag_end < 0: + out.append(text[pos:]) + break + close = text.lower().find("", tag_end + 1) + if close < 0: + out.append(text[pos:]) + break + out.append(text[pos:start]) + pos = close + len("") + return "".join(out) + + def _parse_stepfun_tool_call(tool_name: str, body: str) -> Optional[ToolBlock]: """Parse StepFun native tool-call tokens into an Odysseus ToolBlock.""" tool_name = tool_name.lower().replace("-", "_").replace(".", "_") @@ -780,6 +802,6 @@ def strip_tool_blocks(text: str, skip_fenced: bool = False) -> str: _, (start, end) = raw_web_json cleaned = cleaned[:start] + cleaned[end:] # Strip bare blocks not wrapped in - cleaned = _XML_INVOKE_RE.sub('', cleaned) + cleaned = _strip_bare_invoke_markup(cleaned) cleaned = re.sub(r'\n{3,}', '\n\n', cleaned) return cleaned.strip()