mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Providers: omit temperature for OpenAI reasoning models
* fix: omit temperature for OpenAI reasoning models (o1/o3/o4/gpt-5)

  These models only accept the default temperature; sending any explicit value (even 0.0) returns HTTP 400 "Only the default (1) value is supported". This broke two paths:

  - Endpoint probing in _probe_single_model hardcodes temperature: 0.0, so a perfectly valid o3/gpt-5 endpoint is reported as failing in the Model Endpoints health check.
  - Chat/stream payloads send temperature unconditionally, so a non-default temperature preset 400s on these models.

  The code already special-cases the same model family for max_completion_tokens, so this adds a sibling _restricts_temperature() helper and omits the field for those models, letting the API use its required default. gpt-4.5 is intentionally excluded (not a reasoning model; accepts temperature normally).

  Adds tests/test_llm_core_temperature.py covering the predicate and the synchronous payload builder.

* fix: also omit temperature for reasoning models on the direct-POST paths

  The first commit only covered llm_call/llm_call_async/stream_llm and the endpoint probe. Email auto-summary, urgency-less spam classification, the email reply-summary endpoint, and gallery vision tagging build their OpenAI payloads inline and POST them directly (requests/httpx), bypassing llm_core — so a reasoning model configured there would still 400 on the temperature field.

  These sites already branch on _uses_max_completion_tokens, so they're the same class; added the matching _restricts_temperature guard. gallery_routes also gains the max_completion_tokens branch it was missing, so gpt-5 vision tagging works end to end.

  Note: email_pollers urgency scoring goes through llm_call_async and was already covered.
This commit is contained in:
@@ -132,7 +132,7 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
|
|||||||
import sqlite3 as _sql3
|
import sqlite3 as _sql3
|
||||||
import requests as _req
|
import requests as _req
|
||||||
from src.endpoint_resolver import resolve_endpoint
|
from src.endpoint_resolver import resolve_endpoint
|
||||||
from src.llm_core import _uses_max_completion_tokens
|
from src.llm_core import _uses_max_completion_tokens, _restricts_temperature
|
||||||
|
|
||||||
settings = _load_settings()
|
settings = _load_settings()
|
||||||
auto_sum = settings.get("email_auto_summarize", False)
|
auto_sum = settings.get("email_auto_summarize", False)
|
||||||
@@ -355,6 +355,9 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
|
|||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
}
|
}
|
||||||
|
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature.
|
||||||
|
if _restricts_temperature(model):
|
||||||
|
payload.pop("temperature", None)
|
||||||
try:
|
try:
|
||||||
# Use to_thread so this sync HTTP call doesn't freeze
|
# Use to_thread so this sync HTTP call doesn't freeze
|
||||||
# the entire event loop while the LLM thinks (240s).
|
# the entire event loop while the LLM thinks (240s).
|
||||||
@@ -806,6 +809,9 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
|
|||||||
"temperature": 0.1,
|
"temperature": 0.1,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
}
|
}
|
||||||
|
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature.
|
||||||
|
if _restricts_temperature(model):
|
||||||
|
payload.pop("temperature", None)
|
||||||
# to_thread keeps the event loop responsive during the LLM call
|
# to_thread keeps the event loop responsive during the LLM call
|
||||||
resp = await asyncio.to_thread(
|
resp = await asyncio.to_thread(
|
||||||
_req.post, url, json=payload, headers=req_headers, timeout=120
|
_req.post, url, json=payload, headers=req_headers, timeout=120
|
||||||
|
|||||||
@@ -2419,7 +2419,7 @@ def setup_email_routes():
|
|||||||
"""Generate a quick AI summary of an email body."""
|
"""Generate a quick AI summary of an email body."""
|
||||||
try:
|
try:
|
||||||
from src.endpoint_resolver import resolve_endpoint
|
from src.endpoint_resolver import resolve_endpoint
|
||||||
from src.llm_core import _uses_max_completion_tokens
|
from src.llm_core import _uses_max_completion_tokens, _restricts_temperature
|
||||||
import requests as _req
|
import requests as _req
|
||||||
|
|
||||||
body = data.get("body", "")
|
body = data.get("body", "")
|
||||||
@@ -2476,6 +2476,9 @@ def setup_email_routes():
|
|||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
}
|
}
|
||||||
|
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature.
|
||||||
|
if _restricts_temperature(model):
|
||||||
|
payload.pop("temperature", None)
|
||||||
resp = await asyncio.to_thread(
|
resp = await asyncio.to_thread(
|
||||||
_req.post, url, json=payload, headers=req_headers, timeout=180
|
_req.post, url, json=payload, headers=req_headers, timeout=180
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1707,7 +1707,7 @@ def setup_gallery_routes() -> APIRouter:
|
|||||||
return {"error": "No vision-capable endpoint configured"}
|
return {"error": "No vision-capable endpoint configured"}
|
||||||
|
|
||||||
# Call vision model — format differs between Anthropic and OpenAI
|
# Call vision model — format differs between Anthropic and OpenAI
|
||||||
from src.llm_core import _detect_provider
|
from src.llm_core import _detect_provider, _restricts_temperature, _uses_max_completion_tokens
|
||||||
provider = _detect_provider(chat_url)
|
provider = _detect_provider(chat_url)
|
||||||
tag_prompt = (
|
tag_prompt = (
|
||||||
"Analyze this photo. Return ONLY a comma-separated list of tags. "
|
"Analyze this photo. Return ONLY a comma-separated list of tags. "
|
||||||
@@ -1732,6 +1732,7 @@ def setup_gallery_routes() -> APIRouter:
|
|||||||
}],
|
}],
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
|
_tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model_name) else "max_tokens"
|
||||||
payload = {
|
payload = {
|
||||||
"model": model_name,
|
"model": model_name,
|
||||||
"messages": [{
|
"messages": [{
|
||||||
@@ -1741,9 +1742,12 @@ def setup_gallery_routes() -> APIRouter:
|
|||||||
{"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
|
{"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
|
||||||
],
|
],
|
||||||
}],
|
}],
|
||||||
"max_tokens": 200,
|
_tok_key: 200,
|
||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
}
|
}
|
||||||
|
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature.
|
||||||
|
if _restricts_temperature(model_name):
|
||||||
|
payload.pop("temperature", None)
|
||||||
|
|
||||||
h = {"Content-Type": "application/json"}
|
h = {"Content-Type": "application/json"}
|
||||||
if headers:
|
if headers:
|
||||||
|
|||||||
@@ -251,9 +251,13 @@ def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 1
|
|||||||
target_url = build_chat_url(base)
|
target_url = build_chat_url(base)
|
||||||
h = build_headers(api_key, base)
|
h = build_headers(api_key, base)
|
||||||
h["Content-Type"] = "application/json"
|
h["Content-Type"] = "application/json"
|
||||||
from src.llm_core import _uses_max_completion_tokens
|
from src.llm_core import _uses_max_completion_tokens, _restricts_temperature
|
||||||
_max_key = "max_completion_tokens" if _uses_max_completion_tokens(model_id) else "max_tokens"
|
_max_key = "max_completion_tokens" if _uses_max_completion_tokens(model_id) else "max_tokens"
|
||||||
payload = {"model": model_id, "messages": messages, _max_key: 5, "temperature": 0.0}
|
payload = {"model": model_id, "messages": messages, _max_key: 5}
|
||||||
|
# Reasoning models (o1/o3/o4/gpt-5) reject an explicit temperature, so a
|
||||||
|
# probe that hardcodes one falsely reports a working endpoint as failing.
|
||||||
|
if not _restricts_temperature(model_id):
|
||||||
|
payload["temperature"] = 0.0
|
||||||
if _test_tools:
|
if _test_tools:
|
||||||
payload["tools"] = _test_tools
|
payload["tools"] = _test_tools
|
||||||
|
|
||||||
|
|||||||
@@ -403,6 +403,22 @@ def _uses_max_completion_tokens(model: str) -> bool:
|
|||||||
m = model.lower()
|
m = model.lower()
|
||||||
return any(m.startswith(p) or f"/{p}" in m for p in _MAX_COMPLETION_TOKENS_MODELS)
|
return any(m.startswith(p) or f"/{p}" in m for p in _MAX_COMPLETION_TOKENS_MODELS)
|
||||||
|
|
||||||
|
# OpenAI reasoning models (o1, o3, o4, gpt-5 families) only accept the default
|
||||||
|
# temperature. Sending any explicit value — even 0.0 — returns HTTP 400
|
||||||
|
# ("Only the default (1) value is supported"). That otherwise breaks chat when a
|
||||||
|
# preset sets a non-default temperature, and makes endpoint probing report a
|
||||||
|
# perfectly good model as failing. For these models we omit the field and let
|
||||||
|
# the API use its required default. (gpt-4.5 is intentionally excluded — it is
|
||||||
|
# not a reasoning model and accepts temperature normally.)
|
||||||
|
_FIXED_TEMPERATURE_MODELS = ("o1", "o3", "o4", "gpt-5")
|
||||||
|
|
||||||
|
def _restricts_temperature(model: str) -> bool:
|
||||||
|
"""Check if a model rejects any non-default temperature."""
|
||||||
|
if not model:
|
||||||
|
return False
|
||||||
|
m = model.lower()
|
||||||
|
return any(m.startswith(p) or f"/{p}" in m for p in _FIXED_TEMPERATURE_MODELS)
|
||||||
|
|
||||||
# Models that support structured thinking — may output </think> without opening tag
|
# Models that support structured thinking — may output </think> without opening tag
|
||||||
_THINKING_MODEL_PATTERNS = ("qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", "m2-reap")
|
_THINKING_MODEL_PATTERNS = ("qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", "m2-reap")
|
||||||
|
|
||||||
@@ -738,6 +754,8 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
|
|||||||
"messages": messages_copy,
|
"messages": messages_copy,
|
||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
}
|
}
|
||||||
|
if _restricts_temperature(model):
|
||||||
|
payload.pop("temperature", None)
|
||||||
if max_tokens and max_tokens > 0:
|
if max_tokens and max_tokens > 0:
|
||||||
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
|
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
|
||||||
payload[tok_key] = max_tokens
|
payload[tok_key] = max_tokens
|
||||||
@@ -857,6 +875,8 @@ async def llm_call_async(
|
|||||||
"messages": messages_copy,
|
"messages": messages_copy,
|
||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
}
|
}
|
||||||
|
if _restricts_temperature(model):
|
||||||
|
payload.pop("temperature", None)
|
||||||
if max_tokens and max_tokens > 0:
|
if max_tokens and max_tokens > 0:
|
||||||
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
|
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
|
||||||
payload[tok_key] = max_tokens
|
payload[tok_key] = max_tokens
|
||||||
@@ -958,6 +978,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
|||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
}
|
}
|
||||||
|
if _restricts_temperature(model):
|
||||||
|
payload.pop("temperature", None)
|
||||||
if provider not in {"openrouter", "groq"}:
|
if provider not in {"openrouter", "groq"}:
|
||||||
payload["stream_options"] = {"include_usage": True}
|
payload["stream_options"] = {"include_usage": True}
|
||||||
if max_tokens and max_tokens > 0:
|
if max_tokens and max_tokens > 0:
|
||||||
|
|||||||
@@ -0,0 +1,68 @@
|
|||||||
|
"""Regression tests: OpenAI reasoning models reject a non-default temperature.
|
||||||
|
|
||||||
|
o1/o3/o4/gpt-5 only accept the default temperature (1); sending an explicit
|
||||||
|
value — even 0.0 — returns HTTP 400 "Only the default (1) value is supported".
|
||||||
|
The OpenAI-compatible payload builders must omit the temperature field for these
|
||||||
|
models so chat (with a non-default preset) and endpoint probing don't break.
|
||||||
|
"""
|
||||||
|
import httpx
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src import llm_core
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"model",
|
||||||
|
["o1", "o1-mini", "o3", "o3-mini", "o4-mini", "gpt-5", "gpt-5-mini",
|
||||||
|
"openrouter/openai/o3-mini", "OpenAI/GPT-5"],
|
||||||
|
)
|
||||||
|
def test_reasoning_models_restrict_temperature(model):
|
||||||
|
assert llm_core._restricts_temperature(model) is True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"model",
|
||||||
|
["gpt-4o", "gpt-4.1", "gpt-3.5-turbo", "gpt-4.5-preview",
|
||||||
|
"claude-3-5-sonnet", "llama3.1", "", None],
|
||||||
|
)
|
||||||
|
def test_normal_models_allow_temperature(model):
|
||||||
|
assert llm_core._restricts_temperature(model) is False
|
||||||
|
|
||||||
|
|
||||||
|
def _capture_openai_payload(monkeypatch, model, temperature):
|
||||||
|
"""Run a synchronous OpenAI-compatible call and return the posted JSON body."""
|
||||||
|
llm_core._response_cache.clear()
|
||||||
|
seen = {}
|
||||||
|
|
||||||
|
def fake_post(url, headers=None, json=None, timeout=None):
|
||||||
|
seen["json"] = json
|
||||||
|
request = httpx.Request("POST", url)
|
||||||
|
return httpx.Response(
|
||||||
|
200,
|
||||||
|
request=request,
|
||||||
|
json={"choices": [{"message": {"content": "OK"}}]},
|
||||||
|
)
|
||||||
|
|
||||||
|
monkeypatch.setattr(llm_core.httpx, "post", fake_post)
|
||||||
|
result = llm_core.llm_call(
|
||||||
|
"https://api.openai.com/v1/chat/completions",
|
||||||
|
model,
|
||||||
|
[{"role": "user", "content": "Say OK"}],
|
||||||
|
temperature=temperature,
|
||||||
|
max_tokens=5,
|
||||||
|
)
|
||||||
|
assert result == "OK"
|
||||||
|
return seen["json"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_reasoning_model_payload_omits_temperature(monkeypatch):
|
||||||
|
payload = _capture_openai_payload(monkeypatch, "o3-mini", 0.0)
|
||||||
|
assert "temperature" not in payload
|
||||||
|
# Reasoning models also use max_completion_tokens, which must survive.
|
||||||
|
assert payload["max_completion_tokens"] == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_normal_model_payload_keeps_temperature(monkeypatch):
|
||||||
|
payload = _capture_openai_payload(monkeypatch, "gpt-4o", 0.2)
|
||||||
|
assert payload["temperature"] == 0.2
|
||||||
|
assert payload["max_tokens"] == 5
|
||||||
Reference in New Issue
Block a user