Providers: omit temperature for OpenAI reasoning models

* fix: omit temperature for OpenAI reasoning models (o1/o3/o4/gpt-5)

These models only accept the default temperature; sending any explicit
value (even 0.0) returns HTTP 400 "Only the default (1) value is
supported". This broke two paths:

- Endpoint probing in _probe_single_model hardcodes temperature: 0.0, so
  a perfectly valid o3/gpt-5 endpoint is reported as failing in the
  Model Endpoints health check.
- Chat/stream payloads send temperature unconditionally, so a non-default
  temperature preset is rejected with HTTP 400 on these models.

The code already special-cases the same model family for
max_completion_tokens, so this adds a sibling _restricts_temperature()
helper and omits the field for those models, letting the API use its
required default. gpt-4.5 is intentionally excluded (not a reasoning
model; accepts temperature normally).

Adds tests/test_llm_core_temperature.py covering the predicate and the
synchronous payload builder.

* fix: also omit temperature for reasoning models on the direct-POST paths

The first commit only covered llm_call/llm_call_async/stream_llm and the
endpoint probe. Email auto-summary, urgency-less spam classification, the
email reply-summary endpoint, and gallery vision tagging build their
OpenAI payloads inline and POST them directly (requests/httpx), bypassing
llm_core — so a reasoning model configured there would still get an HTTP 400
for the temperature field. The email sites already branch on
_uses_max_completion_tokens, so they are the same class of problem; this adds
the matching _restricts_temperature guard at each of these sites.

gallery_routes also gains the max_completion_tokens branch it was missing,
so gpt-5 vision tagging works end to end.

Note: email_pollers urgency scoring goes through llm_call_async and was
already covered.
This commit is contained in:
SurprisedDuck
2026-06-02 13:58:33 +02:00
committed by GitHub
parent 119075f368
commit 934bca9e48
6 changed files with 113 additions and 6 deletions
+7 -1
View File
@@ -132,7 +132,7 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
import sqlite3 as _sql3
import requests as _req
from src.endpoint_resolver import resolve_endpoint
from src.llm_core import _uses_max_completion_tokens
from src.llm_core import _uses_max_completion_tokens, _restricts_temperature
settings = _load_settings()
auto_sum = settings.get("email_auto_summarize", False)
@@ -355,6 +355,9 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
"temperature": 0.3,
"stream": False,
}
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature.
if _restricts_temperature(model):
payload.pop("temperature", None)
try:
# Use to_thread so this sync HTTP call doesn't freeze
# the entire event loop while the LLM thinks (240s).
@@ -806,6 +809,9 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
"temperature": 0.1,
"stream": False,
}
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature.
if _restricts_temperature(model):
payload.pop("temperature", None)
# to_thread keeps the event loop responsive during the LLM call
resp = await asyncio.to_thread(
_req.post, url, json=payload, headers=req_headers, timeout=120
+4 -1
View File
@@ -2419,7 +2419,7 @@ def setup_email_routes():
"""Generate a quick AI summary of an email body."""
try:
from src.endpoint_resolver import resolve_endpoint
from src.llm_core import _uses_max_completion_tokens
from src.llm_core import _uses_max_completion_tokens, _restricts_temperature
import requests as _req
body = data.get("body", "")
@@ -2476,6 +2476,9 @@ def setup_email_routes():
"temperature": 0.3,
"stream": False,
}
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature.
if _restricts_temperature(model):
payload.pop("temperature", None)
resp = await asyncio.to_thread(
_req.post, url, json=payload, headers=req_headers, timeout=180
)
+6 -2
View File
@@ -1707,7 +1707,7 @@ def setup_gallery_routes() -> APIRouter:
return {"error": "No vision-capable endpoint configured"}
# Call vision model — format differs between Anthropic and OpenAI
from src.llm_core import _detect_provider
from src.llm_core import _detect_provider, _restricts_temperature, _uses_max_completion_tokens
provider = _detect_provider(chat_url)
tag_prompt = (
"Analyze this photo. Return ONLY a comma-separated list of tags. "
@@ -1732,6 +1732,7 @@ def setup_gallery_routes() -> APIRouter:
}],
}
else:
_tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model_name) else "max_tokens"
payload = {
"model": model_name,
"messages": [{
@@ -1741,9 +1742,12 @@ def setup_gallery_routes() -> APIRouter:
{"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
],
}],
"max_tokens": 200,
_tok_key: 200,
"temperature": 0.3,
}
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature.
if _restricts_temperature(model_name):
payload.pop("temperature", None)
h = {"Content-Type": "application/json"}
if headers:
+6 -2
View File
@@ -251,9 +251,13 @@ def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 1
target_url = build_chat_url(base)
h = build_headers(api_key, base)
h["Content-Type"] = "application/json"
from src.llm_core import _uses_max_completion_tokens
from src.llm_core import _uses_max_completion_tokens, _restricts_temperature
_max_key = "max_completion_tokens" if _uses_max_completion_tokens(model_id) else "max_tokens"
payload = {"model": model_id, "messages": messages, _max_key: 5, "temperature": 0.0}
payload = {"model": model_id, "messages": messages, _max_key: 5}
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature, so a
# probe that hardcodes one falsely reports a working endpoint as failing.
if not _restricts_temperature(model_id):
payload["temperature"] = 0.0
if _test_tools:
payload["tools"] = _test_tools