mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
fix(chat): stabilize system prompt, sequence memory extraction, and send stable session id to preserve KV cache (#3360)
* fix(chat): stabilize system prompt, sequence memory extraction, send stable session id to preserve KV cache Fixes #2927. As diagnosed in the issue, three things in Odysseus's request pattern actively destroyed local backends' (llama.cpp / LM Studio) KV-cache continuity, forcing a full prompt re-evaluation (15-30s+) on every turn: 1. Dynamic content folded into the system prompt every turn. Both the chat preface (ChatProcessor.build_context_preface) and the agent system prompt (_build_system_prompt) injected current_datetime_prompt() — text that changes every minute — directly into system-role messages, which llm_core then concatenates into the single system message sent as the cached prefix. Any byte difference there invalidates the entire cache. Moved this to a new current_datetime_context_message() helper that returns a standalone user-role message, inserted near the end of the array (right before the latest user turn) instead of mixed into the system prompt. The static system prefix (preset prompt + safety policy + agent base prompt) now stays byte-identical across turns of the same session. 2. Memory/skill extraction side-requests competed with the main completion. run_post_response_tasks fired extract_and_store / maybe_extract_skill via asyncio.create_task — fire-and-forget coroutines that could overlap the next turn's main request and steal llama.cpp's limited processing slots, evicting the cached checkpoint. They're now queued through a new _run_extraction_jobs_sequentially helper that waits for the session's stream to go idle and runs the jobs strictly one at a time. 3. No stable session identifier was sent to local backends, so llama.cpp assigned a new processing slot via LRU every turn ("session_id=<empty> server-selected (LCP/LRU)"), losing slot affinity. 
Added _apply_local_cache_affinity() in llm_core, which sets session_id and cache_prompt: true on outgoing payloads — gated to self-hosted OpenAI-compatible endpoints only (never api.openai.com or other cloud providers, which reject unrecognized request fields with a 400). Threaded session_id through stream_llm / llm_call_async / stream_agent_loop from the existing Odysseus session id. Tests in tests/test_kv_cache_invalidation_2927.py exercise the real payload-assembly and scheduling code paths: byte-identical system prefix across two turns of the same session (with a regression check that genuinely changed instructions DO still change it), the dynamic time block landing as a user-role message, extraction jobs waiting for the stream to go idle and running sequentially, and the outgoing payload carrying a stable session_id (same across turns of one session, different across sessions) only for self-hosted endpoints. Updated tests/test_user_time.py for the new message placement. * fix(tests): accept owner= kwarg in normalize_model_id monkeypatch The upstream normalize_model_id signature now takes an owner= keyword argument, and chat_helpers.py passes owner=getattr(sess, "owner", None) at the call site. Update the test stub lambda to **kwargs so it handles the new argument without breaking, and update chat_helpers.py to forward the owner parameter consistently. --------- Co-authored-by: Alexandre Teixeira <111787685+alteixeira20@users.noreply.github.com>
This commit is contained in:
+91
-5
@@ -615,6 +615,26 @@ async def build_chat_context(
|
||||
# Build messages
|
||||
messages = preface + sess.get_context_messages()
|
||||
|
||||
# Current date/time — injected as a standalone *user*-role context message
|
||||
# placed immediately before the latest user turn, NOT folded into the
|
||||
# system prompt. Its text changes every minute, and local OpenAI-compatible
|
||||
# backends (llama.cpp / LM Studio) key their KV-cache prefix off the
|
||||
# system message byte-for-byte; mixing ever-changing timestamp text into
|
||||
# it would invalidate the cached prefix on every request (issue #2927).
|
||||
# Placing it at the tail also keeps it out of the stable
|
||||
# preface+history prefix, so that prefix stays byte-identical turn over
|
||||
# turn (modulo the genuinely new history entries) and the cache survives.
|
||||
if not agent_mode:
|
||||
try:
|
||||
from src.user_time import current_datetime_context_message
|
||||
_dt_msg = current_datetime_context_message()
|
||||
if messages and messages[-1].get("role") == "user":
|
||||
messages.insert(len(messages) - 1, _dt_msg)
|
||||
else:
|
||||
messages.append(_dt_msg)
|
||||
except Exception:
|
||||
logger.debug("Failed to add current date/time context", exc_info=True)
|
||||
|
||||
# Auto-compact
|
||||
messages, context_length, was_compacted = await maybe_compact(
|
||||
sess, sess.endpoint_url, sess.model, messages, sess.headers, owner=user,
|
||||
@@ -911,6 +931,54 @@ def save_assistant_response(
|
||||
return None
|
||||
|
||||
|
||||
def _is_session_stream_active(session_id: str) -> bool:
|
||||
"""Best-effort check for "is a chat completion currently streaming for
|
||||
this session?" — used to keep background extraction from overlapping a
|
||||
main completion and competing for the local backend's processing slots
|
||||
(issue #2927). Lazily imports the route module's live registry to avoid
|
||||
a circular import (chat_routes imports this module at load time)."""
|
||||
try:
|
||||
from routes import chat_routes as _cr
|
||||
return session_id in getattr(_cr, "_active_streams", {})
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
async def _run_extraction_jobs_sequentially(session_id: str, jobs: list, max_wait_s: float = 120.0):
    """Run queued background-extraction awaitables one at a time, each only
    once no chat completion is actively streaming for this session.

    As diagnosed in issue #2927, firing memory/skill extraction concurrently
    with the main chat completion (or with each other) makes them compete for
    the local backend's limited processing slots, evicting the main
    conversation's cached KV-cache checkpoint and forcing a full prompt
    re-evaluation on the next turn. Waiting for the stream to go idle and then
    running the jobs strictly in sequence keeps at most one "side" request in
    flight against the backend at any time.

    Args:
        session_id: Session whose stream activity gates each job.
        jobs: ``(name, awaitable)`` pairs, run in list order. ``name`` is only
            used in the failure log message.
        max_wait_s: Upper bound, in seconds, on how long to wait for the
            stream to go idle before *each* job. When the bound is reached the
            job runs anyway, so extraction is delayed but never dropped.

    Failures of an individual job (any ``Exception``) are logged and do not
    stop later jobs. If this runner itself is cancelled, the jobs that were
    never started are closed so they do not leak as "coroutine was never
    awaited" warnings.
    """
    poll_interval = 0.25
    loop = asyncio.get_running_loop()

    async def _wait_for_idle_stream() -> None:
        # Measure against the loop's monotonic clock rather than summing poll
        # intervals, so a busy event loop cannot stretch the wait budget.
        deadline = loop.time() + max_wait_s
        while _is_session_stream_active(session_id) and loop.time() < deadline:
            await asyncio.sleep(poll_interval)

    pending = list(jobs)
    try:
        while pending:
            # Checked before every job (including the first): a fast follow-up
            # message from the user may have started a new stream for this
            # session while the previous job was running.
            await _wait_for_idle_stream()
            # Pop only after the wait, so a cancellation that lands during the
            # wait still leaves this job in ``pending`` for cleanup below.
            name, job = pending.pop(0)
            try:
                await job
            except Exception:
                logger.warning("[bg-extract] %s extraction job failed for session %s", name, session_id, exc_info=True)
    finally:
        # Only non-empty when we are leaving early (e.g. task cancellation).
        # Coroutine objects expose close(); other awaitables are left alone.
        for _unused_name, leftover in pending:
            close = getattr(leftover, "close", None)
            if callable(close):
                close()
|
||||
|
||||
|
||||
def run_post_response_tasks(
|
||||
sess,
|
||||
session_manager,
|
||||
@@ -933,7 +1001,22 @@ def run_post_response_tasks(
|
||||
extract_skills: bool = True,
|
||||
allow_background_extraction: bool = True,
|
||||
):
|
||||
"""Fire background tasks after a completed response: memory extraction, webhooks, auto-name, skill extraction."""
|
||||
"""Fire background tasks after a completed response: memory extraction, webhooks, auto-name, skill extraction.
|
||||
|
||||
Memory/skill extraction are queued to run *sequentially*, after the main
|
||||
completion stream for this session has fully wound down — never
|
||||
concurrently with it or with each other. As diagnosed in issue #2927,
|
||||
firing these "side" LLM calls in parallel with the main chat completion
|
||||
makes them compete for the local backend's limited processing slots
|
||||
(llama.cpp defaults to 4), evicting the main conversation's cached
|
||||
checkpoint and forcing a full prompt re-evaluation on the next turn. By
|
||||
the time this function runs the main response is already saved, but the
|
||||
extraction calls themselves are still async — queuing them through
|
||||
``_run_extraction_jobs_sequentially`` keeps them from overlapping the *next*
|
||||
turn's request too.
|
||||
"""
|
||||
_extraction_jobs: list = []
|
||||
|
||||
# Memory extraction — only every 4th message pair to avoid excess LLM calls
|
||||
_msg_count = len(sess.history) if hasattr(sess, 'history') else 0
|
||||
_should_extract = (_msg_count >= 4) and (_msg_count % 4 == 0)
|
||||
@@ -943,10 +1026,10 @@ def run_post_response_tasks(
|
||||
t_url, t_model, t_headers = resolve_task_endpoint(
|
||||
sess.endpoint_url, sess.model, sess.headers, owner=owner,
|
||||
)
|
||||
asyncio.create_task(extract_and_store(
|
||||
_extraction_jobs.append(("memory", extract_and_store(
|
||||
sess, memory_manager, memory_vector,
|
||||
t_url, t_model, t_headers,
|
||||
))
|
||||
)))
|
||||
|
||||
# Skill extraction from complex agent runs. Only when the user actually
|
||||
# chose agent mode — not a chat we auto-escalated for a notes/calendar
|
||||
@@ -982,12 +1065,15 @@ def run_post_response_tasks(
|
||||
sess.endpoint_url, sess.model, sess.headers, owner=owner,
|
||||
)
|
||||
logger.debug("[skill-extract] dispatching extractor (model=%s)", s_model)
|
||||
asyncio.create_task(maybe_extract_skill(
|
||||
_extraction_jobs.append(("skill", maybe_extract_skill(
|
||||
sess, skills_manager,
|
||||
s_url, s_model, s_headers,
|
||||
agent_rounds, agent_tool_calls,
|
||||
owner=owner,
|
||||
))
|
||||
)))
|
||||
|
||||
if _extraction_jobs:
|
||||
asyncio.create_task(_run_extraction_jobs_sequentially(session_id, _extraction_jobs))
|
||||
|
||||
# Token accumulation
|
||||
if last_metrics:
|
||||
|
||||
Reference in New Issue
Block a user