Merge pull request #4280 from GeekLuffy/feat/llm-self-eval

feat(teacher): implement Tier 2 LLM self-evaluation
This commit is contained in:
Alexandre Teixeira
2026-06-26 18:35:01 +01:00
committed by GitHub
3 changed files with 343 additions and 8 deletions
+1
View File
@@ -152,6 +152,7 @@ DEFAULT_SETTINGS = {
"utility_model_fallbacks": [], "utility_model_fallbacks": [],
"teacher_model": "", "teacher_model": "",
"teacher_enabled": False, "teacher_enabled": False,
"teacher_tier2_enabled": False,
# Skills: minimum self-reported confidence for an auto-written (LLM-authored) # Skills: minimum self-reported confidence for an auto-written (LLM-authored)
# DRAFT skill to be injected into the agent prompt. Published skills always # DRAFT skill to be injected into the agent prompt. Published skills always
# qualify. Keeps low-confidence auto-skills out of context until they're # qualify. Keeps low-confidence auto-skills out of context until they're
+102 -7
View File
@@ -366,6 +366,71 @@ def _format_trace(tool_results: List[Dict[str, Any]], agent_reply: str) -> str:
return f"<<<UNTRUSTED_TRACE>>>\n{trace}\n<<<END_UNTRUSTED_TRACE>>>" return f"<<<UNTRUSTED_TRACE>>>\n{trace}\n<<<END_UNTRUSTED_TRACE>>>"
# Prompt template for the Tier 2 LLM self-evaluation performed by
# evaluate_turn_llm(). It is filled in with str.format() and expects exactly
# three placeholders: {user_request}, {trace} and {agent_reply}. The model is
# instructed to answer with a single word, "failure" or "ok".
_EVALUATE_TURN_LLM_PROMPT = """\
You are an independent auditor evaluating a student AI agent's turn.
Given the original request, the trace of tool calls and results, and the agent's final reply, determine whether the agent failed, gave up because it lacks the tools/capability/information, or encountered an error.
Respond with exactly one of these two words:
- "failure" if the agent failed, gave up, encountered an error, or asked the user for clarification/missing tools.
- "ok" if the agent successfully completed the task or is making correct progress.
ORIGINAL USER REQUEST:
{user_request}
AGENT TRACE:
{trace}
AGENT REPLY:
{agent_reply}
EVALUATION:"""
def _normalize_llm_verdict(response: str) -> str:
    """Reduce a raw one-word LLM verdict to a bare lowercase token.

    Strips surrounding whitespace, quotes, backticks, markdown emphasis
    markers and trailing sentence punctuation, so that replies such as
    ``"Failure"``, ``failure.`` or ``**failure**`` all normalize to
    ``failure``. Text in the middle of the reply is left untouched, so a
    full sentence never collapses to a single verdict word.
    """
    return response.strip().strip("'\"`*.!").strip().lower()


async def evaluate_turn_llm(
    user_request: str,
    tool_results: List[Dict[str, Any]],
    agent_reply: str,
    student_endpoint_url: str,
    owner: Optional[str] = None,
) -> Tuple[str, Optional[str]]:
    """Use a fast LLM (resolved via the "utility" endpoint) to evaluate a turn.

    Args:
        user_request: The original user request; a placeholder is used in the
            prompt when it is empty.
        tool_results: Tool call records for the turn, rendered into the
            prompt through ``_format_trace``.
        agent_reply: The agent's final reply; a placeholder is used in the
            prompt when it is empty.
        student_endpoint_url: Passed to the endpoint resolver as
            ``fallback_url``.
        owner: Optional owner forwarded to the endpoint resolver.

    Returns:
        ``("failure", reason)`` only when the model's reply, once normalized,
        is exactly the word ``failure``. In every other case (no endpoint or
        model resolved, empty reply, any other reply text, or an exception
        during the LLM call) the result is ``("ok", None)``, i.e. the check
        fails open and never triggers an escalation by accident.
    """
    from src.endpoint_resolver import resolve_endpoint
    from src.llm_core import llm_call_async

    # Resolve utility model (falls back to default model, then student_endpoint_url)
    url, model, headers = resolve_endpoint(
        "utility",
        fallback_url=student_endpoint_url,
        owner=owner,
    )
    if not url or not model:
        return ("ok", None)

    trace_str = _format_trace(tool_results, agent_reply)
    prompt = _EVALUATE_TURN_LLM_PROMPT.format(
        user_request=user_request or "(no user request)",
        trace=trace_str,
        agent_reply=agent_reply or "(no agent reply)",
    )

    try:
        response = await llm_call_async(
            url, model,
            [{"role": "user", "content": prompt}],
            headers=headers,
            timeout=20,
        )
        # Exact-word match on the normalized reply: a sentence that merely
        # contains "failure" must not be treated as a failure verdict.
        if response and _normalize_llm_verdict(response) == "failure":
            return ("failure", f"LLM evaluation flagged failure: {response.strip()}")
    except Exception as e:
        # Best-effort check: any error in the evaluation degrades to "ok".
        logger.warning("Tier 2 LLM self-eval failed: %s", e)

    return ("ok", None)
async def escalate_and_learn( async def escalate_and_learn(
user_request: str, user_request: str,
tool_results: List[Dict[str, Any]], tool_results: List[Dict[str, Any]],
@@ -459,15 +524,34 @@ def maybe_escalate(
# Gate 3: regex eval — only escalate on detected failure. # Gate 3: regex eval — only escalate on detected failure.
status, reason = evaluate_turn_regex(tool_results, agent_reply) status, reason = evaluate_turn_regex(tool_results, agent_reply)
if status != "failure": if status == "failure":
return None
# Fire async — don't block the user's chat. # Fire async — don't block the user's chat.
return asyncio.create_task( return asyncio.create_task(
escalate_and_learn(user_request, tool_results, agent_reply, reason or "", owner), escalate_and_learn(user_request, tool_results, agent_reply, reason or "", owner),
name="teacher_escalation", name="teacher_escalation",
) )
# Gate 4: Tier 2 LLM self-evaluation requires teacher_tier2_enabled
if not get_setting("teacher_tier2_enabled", False):
return None
# Tier 2: LLM self-evaluation background task
async def evaluate_and_maybe_escalate():
llm_status, llm_reason = await evaluate_turn_llm(
user_request=user_request,
tool_results=tool_results,
agent_reply=agent_reply,
student_endpoint_url=student_endpoint_url,
owner=owner,
)
if llm_status == "failure":
await escalate_and_learn(user_request, tool_results, agent_reply, llm_reason or "", owner)
return asyncio.create_task(
evaluate_and_maybe_escalate(),
name="teacher_escalation_tier2",
)
# ── Inline teacher takeover (visible in chat stream) ─────────────── # ── Inline teacher takeover (visible in chat stream) ───────────────
@@ -501,10 +585,6 @@ async def run_teacher_inline(
except Exception: except Exception:
return return
status, reason = evaluate_turn_regex(student_tool_events, student_reply)
if status != "failure":
return
# Extract original user request — last user-role message # Extract original user request — last user-role message
user_request = "" user_request = ""
for m in reversed(student_messages): for m in reversed(student_messages):
@@ -521,6 +601,21 @@ async def run_teacher_inline(
) )
break break
status, reason = evaluate_turn_regex(student_tool_events, student_reply)
if status != "failure":
# Tier 2: LLM self-evaluation check requires teacher_tier2_enabled
if not get_setting("teacher_tier2_enabled", False):
return
status, reason = await evaluate_turn_llm(
user_request=user_request,
tool_results=student_tool_events,
agent_reply=student_reply,
student_endpoint_url=student_endpoint_url,
owner=owner,
)
if status != "failure":
return
# Resolve teacher endpoint # Resolve teacher endpoint
try: try:
from src.ai_interaction import _resolve_model from src.ai_interaction import _resolve_model
+239
View File
@@ -0,0 +1,239 @@
import asyncio
from types import SimpleNamespace
import pytest
import src.teacher_escalation as teacher_escalation
@pytest.mark.asyncio
async def test_evaluate_turn_llm_ok(monkeypatch):
    """An "ok" reply from the utility model yields ("ok", None).

    Also checks that the resolver is queried with the "utility" prefix and
    the caller's owner, and that the LLM call is actually made.
    """
    seen = {}

    def fake_resolve_endpoint(prefix, fallback_url=None, owner=None):
        # Record what the evaluator asked the resolver for.
        seen.update(prefix=prefix, owner=owner)
        return "http://endpoint.local/v1", "utility-model", {}

    async def fake_llm_call_async(url, model, messages, **kwargs):
        seen["called"] = True
        return "ok"

    monkeypatch.setattr("src.endpoint_resolver.resolve_endpoint", fake_resolve_endpoint)
    monkeypatch.setattr("src.llm_core.llm_call_async", fake_llm_call_async)

    outcome = await teacher_escalation.evaluate_turn_llm(
        user_request="test request",
        tool_results=[],
        agent_reply="test reply",
        student_endpoint_url="http://student.local/v1",
        owner="alice",
    )
    status, reason = outcome

    assert status == "ok"
    assert reason is None
    assert seen["prefix"] == "utility"
    assert seen["owner"] == "alice"
    assert seen["called"] is True
@pytest.mark.asyncio
async def test_evaluate_turn_llm_failure(monkeypatch):
    """A quoted, padded, mixed-case "Failure" reply is reported as a failure."""

    def fake_resolve_endpoint(prefix, fallback_url=None, owner=None):
        return "http://endpoint.local/v1", "utility-model", {}

    async def fake_llm_call_async(url, model, messages, **kwargs):
        # Surrounding spaces, double quotes and a capital letter.
        return " \"Failure\" "

    for target, replacement in (
        ("src.endpoint_resolver.resolve_endpoint", fake_resolve_endpoint),
        ("src.llm_core.llm_call_async", fake_llm_call_async),
    ):
        monkeypatch.setattr(target, replacement)

    outcome = await teacher_escalation.evaluate_turn_llm(
        user_request="test request",
        tool_results=[],
        agent_reply="test reply",
        student_endpoint_url="http://student.local/v1",
        owner="alice",
    )
    status, reason = outcome

    assert status == "failure"
    assert "LLM evaluation flagged failure" in reason
@pytest.mark.asyncio
async def test_evaluate_turn_llm_contains_failure_but_not_exact_match(monkeypatch):
    """A sentence that merely contains the word "failure" is treated as ok."""

    def fake_resolve_endpoint(prefix, fallback_url=None, owner=None):
        return "http://endpoint.local/v1", "utility-model", {}

    async def fake_llm_call_async(url, model, messages, **kwargs):
        return "this agent execution is not a failure"

    for target, replacement in (
        ("src.endpoint_resolver.resolve_endpoint", fake_resolve_endpoint),
        ("src.llm_core.llm_call_async", fake_llm_call_async),
    ):
        monkeypatch.setattr(target, replacement)

    outcome = await teacher_escalation.evaluate_turn_llm(
        user_request="test request",
        tool_results=[],
        agent_reply="test reply",
        student_endpoint_url="http://student.local/v1",
        owner="alice",
    )
    status, reason = outcome

    assert status == "ok"
    assert reason is None
@pytest.mark.asyncio
async def test_evaluate_turn_llm_exception_handling(monkeypatch):
    """An exception raised by the LLM call degrades gracefully to ("ok", None)."""

    def fake_resolve_endpoint(prefix, fallback_url=None, owner=None):
        return "http://endpoint.local/v1", "utility-model", {}

    async def fake_llm_call_async(url, model, messages, **kwargs):
        raise RuntimeError("model timeout")

    for target, replacement in (
        ("src.endpoint_resolver.resolve_endpoint", fake_resolve_endpoint),
        ("src.llm_core.llm_call_async", fake_llm_call_async),
    ):
        monkeypatch.setattr(target, replacement)

    # The RuntimeError must not propagate out of the evaluator.
    outcome = await teacher_escalation.evaluate_turn_llm(
        user_request="test request",
        tool_results=[],
        agent_reply="test reply",
        student_endpoint_url="http://student.local/v1",
        owner="alice",
    )
    status, reason = outcome

    assert status == "ok"
    assert reason is None
@pytest.mark.asyncio
async def test_maybe_escalate_triggers_tier2_background_task(monkeypatch):
    """With Tier 2 enabled and a clean regex check, maybe_escalate spawns the
    Tier 2 task, which escalates once the LLM evaluation reports a failure."""
    # Teacher and Tier 2 are both switched on.
    settings = {
        "teacher_enabled": True,
        "teacher_model": "teacher-model",
        "teacher_tier2_enabled": True,
    }

    def fake_get_setting(key, default=None):
        return settings.get(key, default)

    monkeypatch.setattr("src.settings.get_setting", fake_get_setting)

    # Tier 1 (regex) finds nothing wrong.
    def fake_evaluate_turn_regex(*args):
        return ("ok", None)

    monkeypatch.setattr("src.teacher_escalation.evaluate_turn_regex", fake_evaluate_turn_regex)

    llm_eval_called = []

    async def fake_evaluate_turn_llm(*args, **kwargs):
        llm_eval_called.append(True)
        return "failure", "LLM flagged failure"

    monkeypatch.setattr("src.teacher_escalation.evaluate_turn_llm", fake_evaluate_turn_llm)

    escalate_called = []

    async def fake_escalate_and_learn(user_request, tool_results, agent_reply, failure_reason, owner):
        escalate_called.append(failure_reason)
        return "skill-slug"

    monkeypatch.setattr("src.teacher_escalation.escalate_and_learn", fake_escalate_and_learn)

    task = teacher_escalation.maybe_escalate(
        student_endpoint_url="http://student.local/v1",
        mode="agent",
        user_request="test request",
        tool_results=[],
        agent_reply="test reply",
        owner="alice",
    )

    assert task is not None
    assert task.get_name() == "teacher_escalation_tier2"

    # Let the background task run to completion before inspecting the fakes.
    await task

    assert llm_eval_called == [True]
    assert escalate_called == ["LLM flagged failure"]
@pytest.mark.asyncio
async def test_maybe_escalate_tier2_disabled_by_default(monkeypatch):
    """With Tier 2 disabled and a clean regex check, no task is started."""
    # Teacher is on, Tier 2 stays off.
    settings = {
        "teacher_enabled": True,
        "teacher_model": "teacher-model",
        "teacher_tier2_enabled": False,
    }

    def fake_get_setting(key, default=None):
        return settings.get(key, default)

    monkeypatch.setattr("src.settings.get_setting", fake_get_setting)

    # Tier 1 (regex) finds nothing wrong.
    def fake_evaluate_turn_regex(*args):
        return ("ok", None)

    monkeypatch.setattr("src.teacher_escalation.evaluate_turn_regex", fake_evaluate_turn_regex)

    task = teacher_escalation.maybe_escalate(
        student_endpoint_url="http://student.local/v1",
        mode="agent",
        user_request="test request",
        tool_results=[],
        agent_reply="test reply",
        owner="alice",
    )

    # No background work is scheduled while Tier 2 is disabled.
    assert task is None
@pytest.mark.asyncio
async def test_run_teacher_inline_triggers_tier2_escalation(monkeypatch):
    """With Tier 2 enabled, an LLM-flagged failure leads run_teacher_inline to
    emit takeover, tool output and skill-saved events even though the regex
    check passed."""
    # Settings and gates: teacher and Tier 2 both enabled.
    settings = {
        "teacher_enabled": True,
        "teacher_model": "teacher-model",
        "teacher_tier2_enabled": True,
    }

    def fake_get_setting(key, default=None):
        return settings.get(key, default)

    def fake_resolve_model(spec, owner=None):
        return ("http://teacher.local/v1", "teacher-model", {})

    monkeypatch.setattr("src.settings.get_setting", fake_get_setting)
    monkeypatch.setattr("src.ai_interaction._resolve_model", fake_resolve_model)

    # Tier 1 (regex) says "ok" ...
    def fake_evaluate_turn_regex(*args):
        return ("ok", None)

    monkeypatch.setattr("src.teacher_escalation.evaluate_turn_regex", fake_evaluate_turn_regex)

    # ... but Tier 2 (LLM) flags "failure".
    async def fake_evaluate_turn_llm(*args, **kwargs):
        return "failure", "LLM flagged failure"

    monkeypatch.setattr("src.teacher_escalation.evaluate_turn_llm", fake_evaluate_turn_llm)

    # Stand-in for the agent loop: emits a canned tool event, text delta and
    # terminator in SSE framing.
    async def fake_stream_agent_loop(*args, **kwargs):
        canned_chunks = (
            "data: {\"type\": \"tool_output\", \"tool\": \"bash\"}\n\n",
            "data: {\"type\": \"text\", \"delta\": \"Teacher reply\"}\n\n",
            "data: [DONE]\n\n",
        )
        for chunk in canned_chunks:
            yield chunk

    monkeypatch.setattr("src.agent_loop.stream_agent_loop", fake_stream_agent_loop)

    # Teacher call returns a fenced JSON skill definition.
    async def fake_call_teacher(spec, prompt, owner=None):
        return '```json\n{"action": "add", "name": "test-skill"}\n```'

    monkeypatch.setattr("src.teacher_escalation._call_teacher", fake_call_teacher)

    # Skill persistence always reports success.
    async def fake_do_manage_skills(skill_json, owner=None):
        return {"success": True}

    monkeypatch.setattr("src.tool_implementations.do_manage_skills", fake_do_manage_skills)

    events = [
        evt
        async for evt in teacher_escalation.run_teacher_inline(
            student_endpoint_url="http://student.local/v1",
            student_messages=[{"role": "user", "content": "test request"}],
            student_tool_events=[],
            student_reply="student reply",
            owner="alice",
        )
    ]

    # The takeover must be announced, executed and followed by a saved skill.
    assert any("teacher_takeover" in evt for evt in events)
    assert any("tool_output" in evt for evt in events)
    assert any("skill_saved" in evt for evt in events)
@pytest.mark.asyncio
async def test_run_teacher_inline_tier2_disabled_by_default(monkeypatch):
    """With Tier 2 disabled and a clean regex check, run_teacher_inline yields
    no events at all."""
    # Settings and gates: teacher on, Tier 2 off.
    settings = {
        "teacher_enabled": True,
        "teacher_model": "teacher-model",
        "teacher_tier2_enabled": False,
    }

    def fake_get_setting(key, default=None):
        return settings.get(key, default)

    monkeypatch.setattr("src.settings.get_setting", fake_get_setting)

    # Tier 1 (regex) says "ok".
    def fake_evaluate_turn_regex(*args):
        return ("ok", None)

    monkeypatch.setattr("src.teacher_escalation.evaluate_turn_regex", fake_evaluate_turn_regex)

    events = [
        evt
        async for evt in teacher_escalation.run_teacher_inline(
            student_endpoint_url="http://student.local/v1",
            student_messages=[{"role": "user", "content": "test request"}],
            student_tool_events=[],
            student_reply="student reply",
            owner="alice",
        )
    ]

    # Early exit: no takeover, hence no events.
    assert len(events) == 0