diff --git a/src/llm_core.py b/src/llm_core.py
index 26b5f96e7..89c153809 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -681,6 +681,27 @@ def _restricts_temperature(model: str) -> bool:
     m = model.lower()
     return any(m.startswith(p) or f"/{p}" in m for p in _FIXED_TEMPERATURE_MODELS)
 
+# Anthropic removed the sampling parameters (temperature, top_p, top_k) starting
+# with Claude Opus 4.7. On Opus 4.7 and later, sending `temperature` at all —
+# even 0.0 — returns HTTP 400. Earlier Claude models (Opus 4.6 and below, every
+# Sonnet/Haiku) still accept temperature in [0.0, 1.0], so the omission must be
+# version-gated rather than applied to all `claude-*` models.
+def _anthropic_rejects_temperature(model: str) -> bool:
+    """Check if a native-Anthropic model rejects the temperature field (Opus 4.7+)."""
+    if not isinstance(model, str) or not model:
+        return False
+    # `(?<![a-z])` stops "opus" matching inside another word ("octopus"). Major and
+    # minor are each capped at two digits with a `(?!\d)` guard, so the date in
+    # `claude-opus-4-20250514` / `claude-3-opus-20240229` is never read as a
+    # version >= 4.7. Dated 4.7+ snapshots (`claude-opus-4-7-
+    # 20260201`) keep their explicit minor and are still matched.
+    match = re.search(r"(?<![a-z])opus-(\d{1,2})(?!\d)(?:-(\d{1,2})(?!\d))?", model.lower())
+    if match is None:
+        return False
+    major = int(match.group(1))
+    minor = int(match.group(2) or 0)
+    return (major, minor) >= (4, 7)
+
 
 # Models that support structured thinking — may output without opening tag
 _THINKING_MODEL_PATTERNS = ("qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", "m2-reap", "gemma")
@@ -784,8 +805,11 @@ def _build_anthropic_payload(model, messages, temperature, max_tokens, stream=Fa
         "model": model,
         "messages": chat_messages,
         "max_tokens": max_tokens if max_tokens and max_tokens > 0 else 4096,
-        "temperature": temperature,
     }
+    # Opus 4.7+ removed the sampling parameters — sending `temperature` (even 0.0)
+    # returns HTTP 400. Omit it for those models; older Claude models still take it.
+    if not _anthropic_rejects_temperature(model):
+        payload["temperature"] = temperature
     if system_parts:
         system_text = "\n\n".join(system_parts)
         # Send `system` as a structured text block so we can attach a prompt-cache
diff --git a/tests/test_llm_core_anthropic_temp_omit.py b/tests/test_llm_core_anthropic_temp_omit.py
new file mode 100644
index 000000000..2274f1dc9
--- /dev/null
+++ b/tests/test_llm_core_anthropic_temp_omit.py
@@ -0,0 +1,94 @@
+"""Regression guard: Opus 4.7+ rejects the temperature field entirely.
+
+Anthropic removed the sampling parameters (temperature, top_p, top_k) starting
+with Claude Opus 4.7 — sending `temperature` at all, even 0.0, returns HTTP 400.
+This broke every native-Anthropic call to Opus 4.7/4.8, including the research
+endpoint probe (temperature=0) and all DeepResearcher LLM calls, because
+_build_anthropic_payload sent `temperature` unconditionally.
+
+Earlier Claude models (Opus 4.6 and below, every Sonnet/Haiku) still accept
+temperature in [0.0, 1.0], so the omission is version-gated — the clamp-to-[0,1]
+behavior for those models (test_llm_core_anthropic_temp_clamp.py) is unchanged.
+"""
+import os
+
+os.environ.setdefault("DATABASE_URL", "sqlite:///:memory:")
+
+import pytest
+
+from src.llm_core import _anthropic_rejects_temperature, _build_anthropic_payload
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "claude-opus-4-7",
+        "claude-opus-4-8",
+        "claude-opus-4-8-20260101",  # tolerate a dated snapshot suffix
+        "claude-opus-4-7-20260201",  # dated 4.7 snapshot — explicit minor, still >= 4.7
+        "anthropic/claude-opus-4-7",  # tolerate a provider-prefixed id
+        "claude-opus-4-10",  # future minor still >= 4.7
+        "claude-opus-5-0",  # future major
+    ],
+)
+def test_opus_47_plus_rejects_temperature(model):
+    assert _anthropic_rejects_temperature(model) is True
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "claude-opus-4-6",
+        "claude-opus-4-5",
+        "claude-opus-4-1",
+        "claude-opus-4-0",
+        "claude-opus-4",  # bare major (no minor) — kept
+        "claude-opus-4-20250514",  # Opus 4.0 dated id — the date must NOT read as a 4.7+ minor
+        "claude-opus-4-1-20250805",  # Opus 4.1 dated id — explicit minor before the date
+        "claude-opus-4-6-20251201",  # dated 4.6 snapshot — older, still keeps temperature
+        "claude-sonnet-4-6",
+        "claude-3-5-sonnet",
+        "claude-3-opus-20240229",  # legacy Claude 3 Opus — no opus-N-M pattern, kept
+        "claude-haiku-4-5",
+        "claude-x",
+        "octopus-4-8",  # "opus" only as a substring of another word — must not match
+        "myproxy/octopus-4-8",  # same, behind a provider prefix
+        "",
+        None,
+    ],
+)
+def test_older_claude_models_keep_temperature(model):
+    assert _anthropic_rejects_temperature(model) is False
+
+
+@pytest.mark.parametrize("model", [123, 1.5, ["claude-opus-4-8"], {"a": 1}, object()])
+def test_non_string_model_is_handled_without_crashing(model):
+    # Defensive: the gate must not raise on a non-string model (the old builder
+    # never called .lower() on it). Truthy non-strings should classify as False.
+    assert _anthropic_rejects_temperature(model) is False
+
+
+def _payload(model, temperature=0.0):
+    return _build_anthropic_payload(
+        model, [{"role": "user", "content": "hi"}], temperature, 100
+    )
+
+
+def test_payload_omits_temperature_for_opus_47_plus():
+    # The endpoint probe sends temperature=0; on Opus 4.7+ that field must be gone.
+    payload = _payload("claude-opus-4-8", 0.0)
+    assert "temperature" not in payload
+
+
+def test_payload_keeps_temperature_for_older_models():
+    payload = _payload("claude-opus-4-6", 0.3)
+    assert payload["temperature"] == 0.3
+    # Older models retain the [0,1] clamp (Nietzsche preset at 1.2 -> 1.0).
+    assert _payload("claude-3-5-sonnet", 1.2)["temperature"] == 1.0
+
+
+def test_payload_keeps_temperature_for_dated_opus_4_0():
+    # Anthropic's dated id for Opus 4.0 (claude-opus-4-20250514) is in this repo's
+    # ANTHROPIC_MODELS list. The date must not be misread as a >= 4.7 minor, or the
+    # user's temperature would be silently dropped on a model that accepts it.
+    assert _payload("claude-opus-4-20250514", 0.5)["temperature"] == 0.5