refactor(constants): single source of truth for data dir (#3368)

* refactor(constants): single source of truth for data dir + merge core/src constants

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs(contributing): use named src.constants for data paths, drop core/constants references

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mike
2026-06-08 09:58:52 +02:00
committed by GitHub
parent adc6ac9394
commit ac94885c84
56 changed files with 279 additions and 243 deletions
+4 -2
View File
@@ -14,6 +14,8 @@ import uuid
import time
from typing import Dict, Optional, Tuple
from src.constants import GENERATED_IMAGES_DIR
logger = logging.getLogger(__name__)
AI_CHAT_TIMEOUT = 120 # seconds for a single LLM call
@@ -1715,7 +1717,7 @@ async def do_generate_image(content: str, session_id: Optional[str] = None, owne
# GPT image models always return b64_json; DALL-E may return url
if img.get("b64_json"):
img_dir = Path("data/generated_images")
img_dir = Path(GENERATED_IMAGES_DIR)
img_dir.mkdir(parents=True, exist_ok=True)
filename = f"{uuid.uuid4().hex[:12]}.png"
img_path = img_dir / filename
@@ -1728,7 +1730,7 @@ async def do_generate_image(content: str, session_id: Optional[str] = None, owne
try:
dl_resp = httpx.get(img["url"], timeout=60)
if dl_resp.status_code == 200:
img_dir = Path("data/generated_images")
img_dir = Path(GENERATED_IMAGES_DIR)
img_dir.mkdir(parents=True, exist_ok=True)
filename = f"{uuid.uuid4().hex[:12]}.png"
img_path = img_dir / filename
+4 -3
View File
@@ -38,9 +38,10 @@ from core.platform_compat import (
pid_alive,
)
_DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
_JOBS_DIR = _DATA_DIR / "bg_jobs"
_STORE = _DATA_DIR / "bg_jobs.json"
from src.constants import BG_JOBS_DIR, BG_JOBS_FILE
_JOBS_DIR = Path(BG_JOBS_DIR)
_STORE = Path(BG_JOBS_FILE)
# A job that runs longer than this is presumed stuck and reaped (the agent
# still gets a "timed out" follow-up so nothing hangs forever).
+9 -8
View File
@@ -12,7 +12,8 @@ from typing import Tuple
from src.auth_helpers import owner_filter
from core.platform_compat import IS_WINDOWS, find_bash
from core.constants import DATA_DIR, internal_api_base
from core.constants import internal_api_base
from src.constants import DATA_DIR, DEEP_RESEARCH_DIR, TIDY_CALENDAR_STATE_FILE, EMAIL_URGENCY_CACHE_DIR, COOKBOOK_STATE_FILE
logger = logging.getLogger(__name__)
@@ -349,7 +350,7 @@ async def action_tidy_research(owner: str, **kwargs) -> Tuple[str, bool]:
try:
from pathlib import Path
import json as _json
research_dir = Path("data/deep_research")
research_dir = Path(DEEP_RESEARCH_DIR)
if not research_dir.exists():
raise TaskNoop("no research directory")
files = list(research_dir.glob("*.json"))
@@ -387,7 +388,7 @@ async def action_tidy_calendar(owner: str, **kwargs) -> Tuple[str, bool]:
from core.database import SessionLocal, CalendarEvent
from sqlalchemy import func
STATE_FILE = Path("data/tidy_calendar_state.json")
STATE_FILE = Path(TIDY_CALENDAR_STATE_FILE)
last_watermark = None
try:
if STATE_FILE.exists():
@@ -1304,12 +1305,12 @@ async def action_ping_notes(owner: str, **kwargs) -> Tuple[str, bool]:
# users' entries (review C4). Legacy path kept as fallback so a
# single-user install (empty owner) doesn't lose its history.
_owner_slug = "".join(c if (c.isalnum() or c in "-_.@") else "_" for c in (owner or "default"))
STATE = _P(f"data/note_pings_{_owner_slug}.json")
STATE = _P(DATA_DIR) / f"note_pings_{_owner_slug}.json"
STATE.parent.mkdir(parents=True, exist_ok=True)
# One-time migration: if legacy global file exists and per-owner file
# doesn't, seed from global (entries for OTHER owners still get pruned
# on their first run — acceptable, prevents silent loss).
_legacy = _P("data/note_pings.json")
_legacy = _P(DATA_DIR) / "note_pings.json"
if _legacy.exists() and not STATE.exists():
try:
STATE.write_text(_legacy.read_text(encoding="utf-8"), encoding="utf-8")
@@ -1466,8 +1467,8 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]:
# notified_uids / urgency counts. Empty owner falls back to a generic
# filename for single-user installs (matches prior behaviour).
_owner_slug = "".join(c if (c.isalnum() or c in "-_.@") else "_" for c in (owner or "default"))
STATE_PATH = _P(f"data/email_urgency_state_{_owner_slug}.json")
CACHE_DIR = _P("data/email_urgency_cache")
STATE_PATH = _P(DATA_DIR) / f"email_urgency_state_{_owner_slug}.json"
CACHE_DIR = _P(EMAIL_URGENCY_CACHE_DIR)
CACHE_DIR.mkdir(parents=True, exist_ok=True)
STATE_PATH.parent.mkdir(parents=True, exist_ok=True)
AGE_CUTOFF = _dt.utcnow() - _td(days=7)
@@ -2043,7 +2044,7 @@ async def action_cookbook_serve(
except Exception:
end_after_min = 0
state_path = Path(DATA_DIR) / "cookbook_state.json"
state_path = Path(COOKBOOK_STATE_FILE)
try:
state = json.loads(state_path.read_text(encoding="utf-8")) if state_path.exists() else {}
except Exception:
+10 -8
View File
@@ -4,6 +4,8 @@ from typing import List, Optional
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field, field_validator
from src.constants import DATA_DIR as _DATA_DIR_CONST
# Cross-platform OS flag, exposed here so callers can `from src.config import
# IS_WINDOWS`. Defined locally (a trivial `os.name == "nt"`) rather than imported
# from core.platform_compat, to keep this dependency-light config module from
@@ -20,13 +22,13 @@ class DataConfig(BaseSettings):
base_dir: Path = Field(default=Path(__file__).parent.parent, description="Base directory for the application")
# Data paths
data_dir: Path = Field(default=Path("data"), description="Main data directory")
uploads_dir: Path = Field(default=Path("data/uploads"), description="Directory for uploaded files")
sessions_file: Path = Field(default=Path("data/sessions.json"), description="Sessions storage file")
memory_file: Path = Field(default=Path("data/memory.json"), description="Memory storage file")
memory_doc: Path = Field(default=Path("data/memory_doc.md"), description="Memory document file")
personal_dir: Path = Field(default=Path("data/personal_docs"), description="Personal documents directory")
runbook_dir: Path = Field(default=Path("data/personal_docs/runbook"), description="Runbook directory")
data_dir: Path = Field(default=Path(_DATA_DIR_CONST), description="Main data directory")
uploads_dir: Path = Field(default=Path(_DATA_DIR_CONST) / "uploads", description="Directory for uploaded files")
sessions_file: Path = Field(default=Path(_DATA_DIR_CONST) / "sessions.json", description="Sessions storage file")
memory_file: Path = Field(default=Path(_DATA_DIR_CONST) / "memory.json", description="Memory storage file")
memory_doc: Path = Field(default=Path(_DATA_DIR_CONST) / "memory_doc.md", description="Memory document file")
personal_dir: Path = Field(default=Path(_DATA_DIR_CONST) / "personal_docs", description="Personal documents directory")
runbook_dir: Path = Field(default=Path(_DATA_DIR_CONST) / "personal_docs" / "runbook", description="Runbook directory")
# Upload settings
max_upload_size: int = Field(default=10 * 1024 * 1024, description="Maximum upload size in bytes (10MB)")
@@ -139,7 +141,7 @@ class AppConfig(BaseSettings):
base_dir = Path(__file__).parent.parent
# Convert string paths to Path objects relative to base_dir
data_dir = base_dir / "data"
data_dir = Path(_DATA_DIR_CONST)
# Get values from the input dict or use defaults
max_upload_size = v.get("max_upload_size", 10 * 1024 * 1024) if isinstance(v, dict) else 10 * 1024 * 1024
+58 -1
View File
@@ -7,9 +7,12 @@ APP_VERSION = "1.0.0"
# Base paths
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/"
STATIC_DIR = os.path.join(BASE_DIR, "static")
DATA_DIR = os.path.join(BASE_DIR, "data")
DATA_DIR = os.getenv("ODYSSEUS_DATA_DIR", os.path.join(BASE_DIR, "data"))
# Data file paths
# Single source of truth: every persisted file/dir lives under DATA_DIR, which
# is the ONLY place ODYSSEUS_DATA_DIR is read. Import these constants instead of
# re-deriving paths from __file__ or a relative "data" literal.
SESSIONS_FILE = os.path.join(DATA_DIR, "sessions.json")
MEMORY_FILE = os.path.join(DATA_DIR, "memory.json")
MEMORY_DOC = os.path.join(DATA_DIR, "memory_doc.md")
@@ -18,6 +21,41 @@ RUNBOOK_DIR = os.path.join(PERSONAL_DIR, "runbook")
UPLOAD_DIR = os.path.join(DATA_DIR, "uploads")
FEATURES_FILE = os.path.join(DATA_DIR, "features.json")
SETTINGS_FILE = os.path.join(DATA_DIR, "settings.json")
AUTH_FILE = os.path.join(DATA_DIR, "auth.json")
USER_PREFS_FILE = os.path.join(DATA_DIR, "user_prefs.json")
PRESETS_FILE = os.path.join(DATA_DIR, "presets.json")
INTEGRATIONS_FILE = os.path.join(DATA_DIR, "integrations.json")
CONTACTS_FILE = os.path.join(DATA_DIR, "contacts.json")
APP_KEY_FILE = os.path.join(DATA_DIR, ".app_key")
EMBEDDING_ENDPOINT_FILE = os.path.join(DATA_DIR, "embedding_endpoint.json")
COOKBOOK_STATE_FILE = os.path.join(DATA_DIR, "cookbook_state.json")
BG_JOBS_FILE = os.path.join(DATA_DIR, "bg_jobs.json")
VAULT_FILE = os.path.join(DATA_DIR, "vault.json")
TIDY_CALENDAR_STATE_FILE = os.path.join(DATA_DIR, "tidy_calendar_state.json")
SKILLS_FILE = os.path.join(DATA_DIR, "skills.json")
APP_DB = os.path.join(DATA_DIR, "app.db")
SCHEDULED_EMAILS_DB = os.path.join(DATA_DIR, "scheduled_emails.db")
EMAIL_CACHE_DB = os.path.join(DATA_DIR, "email_cache.db")
# Data subdirectories
PERSONAL_UPLOADS_DIR = os.path.join(DATA_DIR, "personal_uploads")
EMOJI_CACHE_DIR = os.path.join(DATA_DIR, "emoji_cache")
RAG_DIR = os.path.join(DATA_DIR, "rag")
CHROMA_DIR = os.path.join(DATA_DIR, "chroma")
BG_JOBS_DIR = os.path.join(DATA_DIR, "bg_jobs")
DEEP_RESEARCH_DIR = os.path.join(DATA_DIR, "deep_research")
MCP_OAUTH_DIR = os.path.join(DATA_DIR, "mcp_oauth")
GENERATED_IMAGES_DIR = os.path.join(DATA_DIR, "generated_images")
TTS_CACHE_DIR = os.path.join(DATA_DIR, "tts_cache")
EMAIL_URGENCY_CACHE_DIR = os.path.join(DATA_DIR, "email_urgency_cache")
SKILLS_DIR = os.path.join(DATA_DIR, "skills")
GALLERY_DIR = os.path.join(DATA_DIR, "gallery")
GALLERY_UPLOADS_DIR = os.path.join(DATA_DIR, "gallery_uploads")
MEMORY_VECTORS_DIR = os.path.join(DATA_DIR, "memory_vectors")
# Paths with an intentional dedicated env override, defaulting under DATA_DIR.
MAIL_ATTACHMENTS_DIR = os.getenv("ODYSSEUS_MAIL_ATTACHMENTS_DIR", os.path.join(DATA_DIR, "mail-attachments"))
FASTEMBED_CACHE_DIR = os.getenv("FASTEMBED_CACHE_PATH", os.path.join(DATA_DIR, "fastembed_cache"))
# Agent tool output limits (single source of truth — imported by tool_execution.py,
# tool_implementations.py, agent_tools.py, and any other module that needs them)
@@ -44,3 +82,22 @@ CLEANUP_INTERVAL_HOURS = int(os.getenv("CLEANUP_INTERVAL_HOURS", "24"))
# Default parameters
DEFAULT_TEMPERATURE = 1.0
DEFAULT_MAX_TOKENS = 0
def internal_api_base() -> str:
    """Return the base URL for in-process loopback calls to Odysseus's own API.

    Agent tools and background jobs reach admin-gated routes by calling the
    running server over HTTP. Resolution order:

    1. ``ODYSSEUS_INTERNAL_BASE`` - explicit override (e.g. behind a TLS
       proxy). Trailing slashes are removed so callers can append paths.
    2. ``APP_PORT`` - ``http://127.0.0.1:$APP_PORT`` (docker-compose).
    3. Fallback ``http://127.0.0.1:7000`` - legacy default.

    127.0.0.1 (not "localhost") avoids IPv6/DNS ambiguity for a strictly-local
    call. Without this, loopback tools fail with "All connection attempts
    failed" whenever the server is not on port 7000.

    Blank or whitespace-only values of either variable are treated as unset,
    so an empty ``APP_PORT`` falls back to 7000 instead of producing the
    unusable URL ``http://127.0.0.1:``.

    Returns:
        The base URL without a trailing slash.
    """
    override = os.environ.get("ODYSSEUS_INTERNAL_BASE", "").strip()
    if override:
        return override.rstrip("/")
    # `or` (rather than a .get default) also covers APP_PORT being set but empty.
    port = os.environ.get("APP_PORT", "").strip() or "7000"
    return f"http://127.0.0.1:{port}"
+3 -2
View File
@@ -19,7 +19,8 @@ import time
from pathlib import Path
import httpx
from core.constants import DATA_DIR, internal_api_base
from core.constants import internal_api_base
from src.constants import COOKBOOK_STATE_FILE
logger = logging.getLogger(__name__)
@@ -130,7 +131,7 @@ async def _stop_serve(session_id: str, remote_host: str = "", ssh_port: str = ""
async def _tick() -> None:
state_path = Path(DATA_DIR) / "cookbook_state.json"
state_path = Path(COOKBOOK_STATE_FILE)
if not state_path.exists():
return
try:
+4 -8
View File
@@ -14,6 +14,8 @@ Set EMBEDDING_URL in .env, e.g.:
import os
from src.constants import FASTEMBED_CACHE_DIR, EMBEDDING_ENDPOINT_FILE
# Windows: force HuggingFace/fastembed to COPY model files rather than symlink
# them. On a network-share/UNC cache dir Windows can't follow HF's symlinks
# ([WinError 1463] "symbolic link cannot be followed"), so ONNX fails to load the
@@ -117,10 +119,7 @@ class FastEmbedClient:
# Persistent cache under data/ so the model survives reboots and so
# the download lands exactly where the admin panel's _is_downloaded()
# check looks (both default to this same path).
cache_dir = os.getenv("FASTEMBED_CACHE_PATH") or os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"data", "fastembed_cache",
)
cache_dir = FASTEMBED_CACHE_DIR
os.makedirs(cache_dir, exist_ok=True)
# Windows self-heal: the HuggingFace-hub cache stores model files as
# symlinks (snapshots/<rev>/model.onnx -> ../../blobs/<hash>). On a
@@ -188,10 +187,7 @@ class FastEmbedClient:
def _load_persisted_endpoint() -> dict:
"""Load the custom embedding endpoint saved from the admin panel."""
try:
endpoint_file = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"data", "embedding_endpoint.json",
)
endpoint_file = EMBEDDING_ENDPOINT_FILE
if os.path.exists(endpoint_file):
import json
data = json.loads(open(endpoint_file, encoding="utf-8").read())
+3 -3
View File
@@ -12,6 +12,8 @@ import os
from datetime import datetime
from typing import Optional
from src.constants import AUTH_FILE
logger = logging.getLogger(__name__)
_task_scheduler = None
@@ -54,9 +56,7 @@ def _resolve_event_owner(owner: Optional[str]) -> Optional[str]:
return owner
try:
from src.constants import DATA_DIR
auth_path = os.path.join(DATA_DIR, "auth.json")
auth_path = AUTH_FILE
with open(auth_path, "r", encoding="utf-8") as f:
users = (json.load(f).get("users") or {})
for username, data in users.items():
+3 -1
View File
@@ -4,8 +4,10 @@ from pathlib import Path
from fastapi import HTTPException
from src.constants import GENERATED_IMAGES_DIR
GENERATED_IMAGE_DIR = Path("data/generated_images")
GENERATED_IMAGE_DIR = Path(GENERATED_IMAGES_DIR)
GENERATED_IMAGE_RE = re.compile(
r"^[a-f0-9]{8,64}\.(png|jpg|jpeg|webp|gif|mp4|mov|webm|mkv|m4v)$"
)
+3 -2
View File
@@ -10,10 +10,11 @@ import httpx
from core.atomic_io import atomic_write_json
from core.platform_compat import safe_chmod
from src.secret_storage import decrypt, encrypt, is_encrypted
from src.constants import DATA_DIR, INTEGRATIONS_FILE, SETTINGS_FILE
log = logging.getLogger(__name__)
DATA_FILE = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "integrations.json")
DATA_FILE = INTEGRATIONS_FILE
# ---------------------------------------------------------------------------
# Presets
@@ -471,7 +472,7 @@ def get_integrations_prompt() -> str:
def migrate_from_settings() -> None:
"""If data/settings.json has miniflux_url and miniflux_api_key, create a
Miniflux integration and clear those keys from settings."""
settings_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "settings.json")
settings_path = SETTINGS_FILE
if not os.path.exists(settings_path):
return
+3 -1
View File
@@ -7,6 +7,8 @@ A thin wrapper around VectorRAG for backward compatibility and additional featur
import logging
from typing import List, Dict, Any, Optional
from src.constants import CHROMA_DIR
# Try to import from different possible locations
try:
from rag_vector import VectorRAG
@@ -24,7 +26,7 @@ class RAGManager:
Most methods delegate directly to VectorRAG.
"""
def __init__(self, persist_directory: str = "data/chroma"):
def __init__(self, persist_directory: str = CHROMA_DIR):
"""Initialize the RAGManager with VectorRAG."""
self.vector_rag = VectorRAG(persist_directory=persist_directory)
logger.info("RAGManager initialized as wrapper for VectorRAG")
+3 -2
View File
@@ -6,6 +6,8 @@ import logging
import time
from pathlib import Path
from src.constants import RAG_DIR
logger = logging.getLogger(__name__)
rag_instance = None
@@ -41,8 +43,7 @@ def get_rag_manager():
try:
from src.rag_vector import VectorRAG
base_dir = Path(__file__).parent.parent
persist_dir = os.path.join(base_dir, "data", "rag")
persist_dir = RAG_DIR
rag_instance = VectorRAG(persist_directory=persist_dir)
if not rag_instance.healthy:
+3 -1
View File
@@ -12,6 +12,8 @@ import re
import logging
import numpy as np
from typing import List, Dict, Any, Optional, Set
from src.constants import CHROMA_DIR
from pathlib import Path
from src.embedding_lanes import (
@@ -51,7 +53,7 @@ def _generate_doc_id(text: str, owner: str = "") -> str:
class VectorRAG:
"""RAG system using ChromaDB vector storage with hybrid search."""
def __init__(self, persist_directory: str = "data/chroma"):
def __init__(self, persist_directory: str = CHROMA_DIR):
self.persist_directory = persist_directory
self._collection = None
self._model = None
+2 -1
View File
@@ -16,10 +16,11 @@ from pathlib import Path
from typing import Optional, Dict
from src.research_utils import strip_thinking, is_low_quality
from src.constants import DEEP_RESEARCH_DIR
logger = logging.getLogger(__name__)
RESEARCH_DATA_DIR = Path("data/deep_research")
RESEARCH_DATA_DIR = Path(DEEP_RESEARCH_DIR)
_RESEARCH_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9-]{1,128}$")
+2 -1
View File
@@ -25,10 +25,11 @@ from pathlib import Path
from cryptography.fernet import Fernet, InvalidToken
from core.platform_compat import safe_chmod
from src.constants import APP_KEY_FILE
logger = logging.getLogger(__name__)
_KEY_PATH = Path(__file__).resolve().parent.parent / "data" / ".app_key"
_KEY_PATH = Path(APP_KEY_FILE)
_PREFIX = "enc:"
_fernet: Fernet | None = None
+2 -2
View File
@@ -20,14 +20,14 @@ from typing import Any, Awaitable, Callable, Dict, Optional, Tuple
from src.tool_security import is_public_blocked_tool, owner_is_admin_or_single_user
from src.tool_policy import ToolPolicy
from src.constants import MAX_OUTPUT_CHARS, MAX_READ_CHARS, MAX_DIFF_LINES
from src.constants import MAX_OUTPUT_CHARS, MAX_READ_CHARS, MAX_DIFF_LINES, DATA_DIR
# Persistent working directory for agent subprocesses.
# Resolves to DATA_DIR from src.constants: <repo_root>/data by default (the
# bind-mounted /app/data volume in Docker, the local data directory for manual
# installs), or the ODYSSEUS_DATA_DIR override when that is set.
# Using this as cwd and HOME prevents the agent from silently creating files
# in ephemeral container layers that are lost on the next rebuild.
_AGENT_WORKDIR = str(pathlib.Path(__file__).parent.parent / "data")
_AGENT_WORKDIR = DATA_DIR
def _unified_diff(old: str, new: str, path: str) -> Optional[Dict[str, Any]]:
+4 -4
View File
@@ -12,7 +12,7 @@ import os
import re
from typing import Any, Dict, List, Optional
from src.constants import MAX_OUTPUT_CHARS, MAX_READ_CHARS
from src.constants import MAX_OUTPUT_CHARS, MAX_READ_CHARS, DEEP_RESEARCH_DIR, VAULT_FILE
from core.constants import internal_api_base
@@ -4057,7 +4057,7 @@ async def do_manage_research(content: str, owner: Optional[str] = None) -> Dict:
args = {}
action = (args.get("action") or "list").lower()
rid = (args.get("id") or args.get("session_id") or args.get("research_id") or "").strip()
data_dir = _Path("data/deep_research")
data_dir = _Path(DEEP_RESEARCH_DIR)
# SECURITY: the research id is interpolated straight into a filesystem
# path (data/deep_research/<rid>.json) for read AND delete. Without this
@@ -4302,7 +4302,7 @@ async def do_manage_contact(content: str, owner: Optional[str] = None) -> Dict:
def _load_vault_config() -> Dict:
"""Load Vaultwarden config from data/vault.json."""
from pathlib import Path
p = Path("data/vault.json")
p = Path(VAULT_FILE)
if p.exists():
try:
return json.loads(p.read_text(encoding="utf-8"))
@@ -4456,7 +4456,7 @@ async def do_vault_unlock(content: str, owner: Optional[str] = None) -> Dict:
# Save session to vault.json
from pathlib import Path
p = Path("data/vault.json")
p = Path(VAULT_FILE)
cfg = {}
if p.exists():
try: