mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
4f7061fd61
Two months of iteration on the Settings panel, integration forms, and small visual nudges across the app. Highlights: Settings restructure - Add Models: split into separate Local + API cards (no more in-card tabs); each fuses Type/Provider with the URL input. - Added Models: new dedicated sidebar tab, with Probe + Clear-offline pulled into its header; Local/API sub-section icons accent-tinted. - Search: Web Search and a new Deep Research card (Model + tuning), with a cross-link to AI Defaults. Provider hints use real clickable anchors; Web Search Test button shows a whirlpool spinner. - AI Defaults: Image Generation card returns; Research Model card carries only Endpoint+Model with a cross-link to Search; Vision / Default / Utility fallbacks unified under one numbered-row design matching Search's chain. - API Permissions (was 'API Tokens'): per-row rename, inline Permissions toggle that expands the scope-edit panel, in-field copy icons (icon→check on success). Empty state accent-tinted. - Integrations: + Add Integration drops a type-picker menu directly under the button (drop-up on tight viewports); each integration form (API, CalDAV, CardDAV, Email, Codex/Claude, Vault, MCP) uses the same accent-outlined Save/Test/Cancel buttons right-aligned. - Danger Zone: Wipe→Delete with trash icons; new 'Delete everything' row at the bottom that loops every category. AI Synthesis (Reminders) - Persona dropdown sourced from PROMPT_TEMPLATES + custom preset. - src/reminder_personas.py mirrors the five built-ins for the server-side synthesis path. - dispatch_reminder() reads reminder_llm_persona and uses the persona's system prompt; empty/unknown falls back to warm-neutral. Esc handling - Kebab menus and the provider picker intercept Esc in capture phase so dismissing a popup no longer closes the whole Settings modal. 
Accent tinting - Scoped CSS rule across data-settings-panel=ai/services/added-models/ search/integrations/reminders for card h2 icons + the Added Models sub-section icons. Codex/Claude integration form - No more auto-creation on form open — explicit Create token button. - New tokens start with every scope granted; existing tokens move out of the integration form into the API Permissions card. - Setup reveal: copy buttons inline inside the token + setup code blocks; shorter subtitle wording. Misc visual polish - Save/Test/Cancel uniformly accent-outlined and right-aligned on every integration form. - Provider logos render inline next to the search fallback selects and the Deep Research Search dropdown. - Trash icons in fallback rows bumped to 20x20 so they fill the 32px button. - Image generation default flipped to off.
107 lines
4.0 KiB
Python
107 lines
4.0 KiB
Python
"""Helpers for the optional markitdown document-extraction dependency.
|
|
|
|
markitdown (MIT, Microsoft) converts Office/EPUB documents to Markdown, which is
more token-efficient and model-legible than a raw text dump. It is **optional**:
install with `pip install -r requirements-optional.txt`. When absent, callers
degrade gracefully (chat shows a hint; the RAG indexer skips the file) — the MIT
core never hard-depends on it. Mirrors the optional-dependency pattern in
`src/pdf_runtime.py`.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# User-facing setup hint; load_markitdown() raises it (wrapped in a
# RuntimeError) when the optional package cannot be imported.
MARKITDOWN_MISSING = (
    "Office/EPUB document extraction requires markitdown. "
    "Install optional dependencies with "
    "`pip install -r requirements-optional.txt`."
)

# Formats routed through markitdown. PDFs stay on pypdf (src/document_processor
# and src/personal_docs); plain text/code/csv/json/markdown/html stay on the
# cheaper built-in text path. These are the formats currently dropped entirely.
MARKITDOWN_EXTS = frozenset(
    (
        ".docx",
        ".pptx",
        ".xlsx",
        ".xls",
        ".epub",
    )
)
|
|
|
|
|
|
def is_markitdown_format(path: str) -> bool:
    """Report whether ``path`` has an extension that is routed through markitdown.

    The extension is taken with ``os.path.splitext`` and compared
    case-insensitively against ``MARKITDOWN_EXTS``. Anything that is not a
    ``str`` yields ``False`` instead of raising.
    """
    if isinstance(path, str):
        _, suffix = os.path.splitext(path)
        return suffix.lower() in MARKITDOWN_EXTS
    return False
|
|
|
|
|
|
def load_markitdown():
    """Import and return the ``MarkItDown`` class.

    The import is done lazily, at call time, because markitdown is an
    optional dependency.

    Raises:
        RuntimeError: carrying ``MARKITDOWN_MISSING`` as its message (and the
            original ``ImportError`` as its cause) when the import fails.
    """
    try:
        from markitdown import MarkItDown as converter_cls  # optional dependency
    except ImportError as import_err:
        raise RuntimeError(MARKITDOWN_MISSING) from import_err
    else:
        return converter_cls
|
|
|
|
|
|
def _extract_docx_native(path: str) -> str | None:
|
|
"""Pure-Python .docx text extractor — no external deps.
|
|
|
|
A .docx file is just a zip of XML. The body prose lives in <w:t> runs
|
|
inside <w:p> paragraphs. Iterating with ElementTree (rather than
|
|
re.findall) keeps paragraph breaks intact and lets the XML parser handle
|
|
namespaces + entity unescaping. Loses tables, footnotes, images and
|
|
list bullets — keeps ~95% of "summarize this doc" content, which is the
|
|
case people hit when markitdown isn't installed.
|
|
"""
|
|
import zipfile
|
|
import xml.etree.ElementTree as ET
|
|
|
|
ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
|
|
try:
|
|
with zipfile.ZipFile(path) as z:
|
|
xml_bytes = z.read("word/document.xml")
|
|
except (zipfile.BadZipFile, KeyError, OSError):
|
|
return None
|
|
try:
|
|
root = ET.fromstring(xml_bytes)
|
|
except ET.ParseError:
|
|
return None
|
|
paragraphs: list[str] = []
|
|
for para in root.iter(f"{ns}p"):
|
|
runs = [t.text or "" for t in para.iter(f"{ns}t")]
|
|
line = "".join(runs).strip()
|
|
if line:
|
|
paragraphs.append(line)
|
|
return "\n\n".join(paragraphs) if paragraphs else None
|
|
|
|
|
|
def convert_to_markdown(path: str) -> str | None:
    """Turn a document into Markdown text using markitdown.

    Never raises for a missing dependency or a failed conversion: callers
    get ``None`` and are expected to degrade gracefully.

    When markitdown is not installed and ``path`` is a string ending in
    ``.docx`` (case-insensitive), the bundled pure-Python extractor is tried
    instead so that Word documents work out of the box. Other Office/EPUB
    formats still need markitdown.

    Args:
        path: Location of the document to convert.

    Returns:
        The converted text — the result's ``text_content`` attribute, or its
        ``markdown`` attribute when ``text_content`` is absent or ``None`` —
        or ``None`` when nothing could be extracted.
    """
    try:
        converter_cls = load_markitdown()
    except RuntimeError:
        # markitdown is unavailable: only .docx has a built-in fallback.
        is_docx = isinstance(path, str) and path.lower().endswith(".docx")
        fallback = _extract_docx_native(path) if is_docx else None
        if fallback:
            logger.info(
                "markitdown not installed — used native .docx extractor for %s",
                path,
            )
            return fallback
        logger.warning("markitdown not installed; cannot extract %s", path)
        return None

    try:
        converted = converter_cls().convert(path)
        markdown = getattr(converted, "text_content", None)
        if markdown is not None:
            return markdown
        # No text_content on this result; use the markdown attribute instead.
        return getattr(converted, "markdown", None)
    except Exception as err:
        # Best-effort boundary: any converter failure is logged, not raised.
        logger.warning("markitdown failed to convert %s: %s", path, err)
        return None
|