mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-19 19:25:27 -04:00
fix(chat): stabilize system prompt, sequence memory extraction, and send stable session id to preserve KV cache (#3360)
* fix(chat): stabilize system prompt, sequence memory extraction, send stable session id to preserve KV cache Fixes #2927. As diagnosed in the issue, three things in Odysseus's request pattern actively destroyed local backends' (llama.cpp / LM Studio) KV-cache continuity, forcing a full prompt re-evaluation (15-30s+) on every turn: 1. Dynamic content folded into the system prompt every turn. Both the chat preface (ChatProcessor.build_context_preface) and the agent system prompt (_build_system_prompt) injected current_datetime_prompt() — text that changes every minute — directly into system-role messages, which llm_core then concatenates into the single system message sent as the cached prefix. Any byte difference there invalidates the entire cache. Moved this to a new current_datetime_context_message() helper that returns a standalone user-role message, inserted near the end of the array (right before the latest user turn) instead of mixed into the system prompt. The static system prefix (preset prompt + safety policy + agent base prompt) now stays byte-identical across turns of the same session. 2. Memory/skill extraction side-requests competed with the main completion. run_post_response_tasks fired extract_and_store / maybe_extract_skill via asyncio.create_task — fire-and-forget coroutines that could overlap the next turn's main request and steal llama.cpp's limited processing slots, evicting the cached checkpoint. They're now queued through a new _run_extraction_jobs_sequentially helper that waits for the session's stream to go idle and runs the jobs strictly one at a time. 3. No stable session identifier was sent to local backends, so llama.cpp assigned a new processing slot via LRU every turn ("session_id=<empty> server-selected (LCP/LRU)"), losing slot affinity. 
Added _apply_local_cache_affinity() in llm_core, which sets session_id and cache_prompt: true on outgoing payloads — gated to self-hosted OpenAI-compatible endpoints only (never api.openai.com or other cloud providers, which reject unrecognized request fields with a 400). Threaded session_id through stream_llm / llm_call_async / stream_agent_loop from the existing Odysseus session id. Tests in tests/test_kv_cache_invalidation_2927.py exercise the real payload-assembly and scheduling code paths: byte-identical system prefix across two turns of the same session (with a regression check that genuinely changed instructions DO still change it), the dynamic time block landing as a user-role message, extraction jobs waiting for the stream to go idle and running sequentially, and the outgoing payload carrying a stable session_id (same across turns of one session, different across sessions) only for self-hosted endpoints. Updated tests/test_user_time.py for the new message placement. * fix(tests): accept owner= kwarg in normalize_model_id monkeypatch The upstream normalize_model_id signature now takes an owner= keyword argument, and chat_helpers.py passes owner=getattr(sess, "owner", None) at the call site. Update the test stub lambda to accept **kwargs so it handles the new argument without breaking, and update chat_helpers.py to forward the owner parameter consistently. --------- Co-authored-by: Alexandre Teixeira <111787685+alteixeira20@users.noreply.github.com>
This commit is contained in:
+17
-2
@@ -890,9 +890,20 @@ def _build_system_prompt(
|
||||
|
||||
# Current date/time for every agent request. This is user-local when the
|
||||
# browser provided timezone headers, with a server-local fallback.
|
||||
#
|
||||
# IMPORTANT: this is intentionally NOT prepended into agent_prompt (the
|
||||
# system message) anymore. Its text changes every minute, and local
|
||||
# OpenAI-compatible backends (llama.cpp / LM Studio) key their KV-cache
|
||||
# prefix off the system message byte-for-byte — mixing ever-changing
|
||||
# timestamp text into the (already large, tool-laden) agent system prompt
|
||||
# would invalidate the cached prefix on every single request, forcing a
|
||||
# full prompt re-evaluation each turn (issue #2927). It's built here as a
|
||||
# standalone *user*-role message and inserted near the end of the array,
|
||||
# right alongside _doc_message / _skills_message, below.
|
||||
_datetime_message = None
|
||||
try:
|
||||
from src.user_time import current_datetime_prompt
|
||||
agent_prompt = current_datetime_prompt() + agent_prompt
|
||||
from src.user_time import current_datetime_context_message
|
||||
_datetime_message = current_datetime_context_message()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -1229,6 +1240,9 @@ def _build_system_prompt(
|
||||
last_user_idx += 1 # the document message is now at last_user_idx
|
||||
if _skills_message:
|
||||
merged.insert(last_user_idx, _skills_message)
|
||||
last_user_idx += 1
|
||||
if _datetime_message:
|
||||
merged.insert(last_user_idx, _datetime_message)
|
||||
|
||||
return merged, mcp_schemas
|
||||
|
||||
@@ -2158,6 +2172,7 @@ async def stream_agent_loop(
|
||||
prompt_type=prompt_type if round_num == 1 else None,
|
||||
tools=all_tool_schemas if all_tool_schemas else None,
|
||||
timeout=agent_stream_timeout,
|
||||
session_id=session_id,
|
||||
):
|
||||
if time.time() > _round_deadline:
|
||||
logger.warning(f"[agent] round {round_num} stream exceeded wall-clock deadline; cutting off")
|
||||
|
||||
+13
-9
@@ -175,6 +175,19 @@ class ChatProcessor:
|
||||
|
||||
Returns:
|
||||
Tuple of (preface messages, rag_sources list)
|
||||
|
||||
Note on KV-cache friendliness: the ``system``-role messages assembled
|
||||
here are later concatenated into a single system message and sent as
|
||||
the very first thing in the payload (see ``llm_core``'s "consolidate
|
||||
system messages" step). Local OpenAI-compatible backends (llama.cpp /
|
||||
LM Studio) key their KV cache off the byte-identical token prefix, so
|
||||
*anything* that changes turn-to-turn — timestamps, retrieved snippets,
|
||||
per-turn counts — must NOT be folded into a system message here. Such
|
||||
content belongs in a separate ``user``/context message appended near
|
||||
the end of the array (see ``current_datetime_context_message`` and
|
||||
``untrusted_context_message`` callers in ``build_chat_context``),
|
||||
which keeps the static system prefix byte-identical across turns of
|
||||
the same session and lets the backend reuse its cached prefix.
|
||||
"""
|
||||
preface = []
|
||||
rag_sources = []
|
||||
@@ -185,15 +198,6 @@ class ChatProcessor:
|
||||
"role": "system",
|
||||
"content": preset_system_prompt
|
||||
})
|
||||
if not agent_mode:
|
||||
try:
|
||||
from src.user_time import current_datetime_prompt
|
||||
preface.append({
|
||||
"role": "system",
|
||||
"content": current_datetime_prompt(),
|
||||
})
|
||||
except Exception:
|
||||
logger.debug("Failed to add current date/time context", exc_info=True)
|
||||
preface.append({
|
||||
"role": "system",
|
||||
"content": UNTRUSTED_CONTEXT_POLICY,
|
||||
|
||||
+42
-2
@@ -455,6 +455,43 @@ def _detect_provider(url: str) -> str:
|
||||
return "openai"
|
||||
|
||||
|
||||
def _is_self_hosted_openai_compatible(url: str) -> bool:
    """Report whether *url* looks like a custom/local OpenAI-compatible server.

    The answer is ``True`` only when ``_detect_provider(url)`` yields
    ``"openai"`` *and* ``_host_match(url, "openai.com")`` is falsy — i.e. the
    endpoint speaks the OpenAI wire format but is not OpenAI's own API
    (think llama.cpp, LM Studio, vLLM, text-generation-webui).

    Callers use this to decide whether llama.cpp-server-specific payload
    extras (``session_id``, ``cache_prompt``) may be attached: OpenAI's real
    API answers unrecognized top-level fields with a 400 ("Unrecognized
    request argument"), whereas self-hosted servers generally ignore unknown
    fields, and llama.cpp's server uses them for KV-cache slot affinity
    (issue #2927).

    NOTE(review): only ``openai.com`` hosts are excluded here. Whether other
    hosted OpenAI-compatible services are filtered out depends on how
    ``_detect_provider`` classifies them — confirm against that helper.
    """
    # Anything that is not routed through the OpenAI-style code path is out.
    if _detect_provider(url) != "openai":
        return False
    # OpenAI-style, so the only remaining exclusion is OpenAI's own host.
    return not _host_match(url, "openai.com")
|
||||
|
||||
|
||||
def _apply_local_cache_affinity(payload: Dict, url: str, session_id: Optional[str]) -> None:
    """Attach llama.cpp-style slot-affinity hints to *payload*, mutating it in place.

    Background (issue #2927): without a stable identifier, llama.cpp picks a
    processing slot per request via LRU ("session_id=<empty> server-selected
    (LCP/LRU)"), so successive turns of one chat may land on different slots
    and lose their cached prefix. A stable ``session_id`` (taken from the
    Odysseus session) lets the server route the conversation to the same
    slot, and ``cache_prompt: true`` asks it to keep and reuse the prefix it
    already holds.

    Both keys are llama.cpp / LM Studio extensions to the OpenAI schema, so
    they are written only when ``_is_self_hosted_openai_compatible(url)`` is
    true; endpoints that reject unknown top-level fields are left untouched.

    Args:
        payload: Outgoing request body; modified in place.
        url: Endpoint URL the payload will be sent to.
        session_id: Stable per-conversation identifier. A falsy value
            (``None`` or empty) makes this function a no-op.

    Returns:
        None. Keys already present in *payload* are never overwritten.
    """
    # The session check comes first so the endpoint check is skipped
    # entirely when there is no identifier to send.
    if session_id and _is_self_hosted_openai_compatible(url):
        # setdefault keeps any value a caller placed in the payload earlier.
        payload.setdefault("session_id", str(session_id))
        payload.setdefault("cache_prompt", True)
|
||||
|
||||
|
||||
def _provider_headers(provider: str, headers: Optional[Dict] = None) -> Dict[str, str]:
|
||||
h = {"Content-Type": "application/json"}
|
||||
if isinstance(headers, dict):
|
||||
@@ -1269,7 +1306,8 @@ async def llm_call_async(
|
||||
headers: Optional[Dict] = None,
|
||||
timeout: int = LLMConfig.STREAM_TIMEOUT,
|
||||
max_retries: int = LLMConfig.MAX_RETRIES,
|
||||
prompt_type: Optional[str] = None
|
||||
prompt_type: Optional[str] = None,
|
||||
session_id: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Asynchronous LLM call using httpx with connection pooling, timeout, retry logic, and performance logging."""
|
||||
provider = _detect_provider(url)
|
||||
@@ -1369,6 +1407,7 @@ async def llm_call_async(
|
||||
# Suppress thinking for qwen3/gemma4 on Ollama /v1 — same as stream_llm.
|
||||
if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
|
||||
payload["think"] = False
|
||||
_apply_local_cache_affinity(payload, url, session_id)
|
||||
|
||||
if _is_host_dead(target_url):
|
||||
raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
|
||||
@@ -1426,7 +1465,7 @@ async def llm_call_async(
|
||||
async def stream_llm(url: str, model: str, messages: List[Dict], temperature: float = LLMConfig.DEFAULT_TEMPERATURE,
|
||||
max_tokens: int = LLMConfig.DEFAULT_MAX_TOKENS, headers: Optional[Dict] = None,
|
||||
timeout: int = LLMConfig.STREAM_TIMEOUT, prompt_type: Optional[str] = None,
|
||||
tools: Optional[List[Dict]] = None):
|
||||
tools: Optional[List[Dict]] = None, session_id: Optional[str] = None):
|
||||
"""Stream LLM responses with improved error handling.
|
||||
|
||||
Yields SSE chunks:
|
||||
@@ -1491,6 +1530,7 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
# <think> blocks. Ollama /v1 accepts "think": false as a top-level param.
|
||||
if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
|
||||
payload["think"] = False
|
||||
_apply_local_cache_affinity(payload, url, session_id)
|
||||
h = _provider_headers(provider, headers)
|
||||
if provider == "copilot":
|
||||
from src.copilot import apply_request_headers
|
||||
|
||||
+24
-1
@@ -9,7 +9,7 @@ from __future__ import annotations
|
||||
import re
|
||||
from contextvars import ContextVar
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
from typing import Dict, Optional
|
||||
|
||||
|
||||
_USER_TZ_OFFSET_MIN: ContextVar[Optional[int]] = ContextVar("user_tz_offset_min", default=None)
|
||||
@@ -136,3 +136,26 @@ def current_datetime_prompt(now_utc: Optional[datetime] = None) -> str:
|
||||
"When scheduling a task with manage_tasks, scheduled_time is in UTC: "
|
||||
"convert the user's stated local time using the UTC offset above.\n\n"
|
||||
)
|
||||
|
||||
|
||||
def current_datetime_context_message(now_utc: Optional[datetime] = None) -> Dict[str, str]:
    """Wrap the current date/time prompt in a standalone ``user``-role message.

    The role is deliberately ``user`` rather than ``system``. The wrapped
    text embeds the clock time down to the minute, so it differs between
    requests; local OpenAI-compatible backends (llama.cpp / LM Studio) key
    their KV-cache prefix off the system message byte-for-byte, and mixing
    this text into the system message would invalidate that cached prefix on
    every request (issue #2927). Emitting it as its own message — meant to be
    placed near the end of the array, just before the latest user turn —
    keeps the static system prompt byte-identical across turns while still
    giving the model fresh grounding for relative-date reasoning.

    Args:
        now_utc: Optional timestamp forwarded unchanged to
            ``current_datetime_prompt``.

    Returns:
        A dict with ``"role"`` set to ``"user"`` and ``"content"`` holding a
        bracketed context header followed by ``current_datetime_prompt``'s
        output.
    """
    # The header tells the model this block is context, not an instruction.
    context_header = (
        "[Context — current date/time, refreshed each turn; not part of "
        "your instructions]\n"
    )
    return {
        "role": "user",
        "content": context_header + current_datetime_prompt(now_utc),
    }
|
||||
|
||||
Reference in New Issue
Block a user