From f5d3e5098a03de77b538e2ba18151c6760ded262 Mon Sep 17 00:00:00 2001 From: Josh Patra Date: Mon, 15 Jun 2026 07:29:22 -0400 Subject: [PATCH] fix(llm): omit temperature for Kimi K2.5 and K2.6 (#3960) --- src/llm_core.py | 30 +++++++++++-- tests/test_llm_core_temperature.py | 67 +++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 5 deletions(-) diff --git a/src/llm_core.py b/src/llm_core.py index 9dfade2cd..1338ef91a 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -605,6 +605,8 @@ def _detect_provider(url: str) -> str: return "groq" if _host_match(url, "nvidia.com"): return "nvidia" + if _host_match(url, "moonshot.ai") or _host_match(url, "moonshot.cn"): + return "moonshot" from src.chatgpt_subscription import is_chatgpt_subscription_base if is_chatgpt_subscription_base(url): return "chatgpt-subscription" @@ -856,6 +858,28 @@ def _restricts_temperature(model: str) -> bool: m = model.lower() return any(m.startswith(p) or f"/{p}" in m for p in _FIXED_TEMPERATURE_MODELS) + +# The official Moonshot API fixes temperature at 1.0 in thinking mode and 0.6 +# when thinking is explicitly disabled for Kimi K2.5/K2.6. Any other explicit +# value returns HTTP 400. Odysseus does not currently send the `thinking` mode +# control, so omit temperature and let Moonshot use its default thinking mode. +# Keep the gate provider-specific: self-hosted Kimi deployments may accept +# custom sampling values, and older Moonshot models have different defaults. +def _moonshot_rejects_custom_temperature(provider: str, model: str) -> bool: + """Check if the official Moonshot API fixes temperature for this model.""" + if provider != "moonshot" or not isinstance(model, str): + return False + model_id = model.lower().rsplit("/", 1)[-1] + return bool(re.match(r"^kimi-k2\.(?:5|6)(?:$|[-_:])", model_id)) + + +def _omit_temperature(provider: str, model: str) -> bool: + """Check if a request should use the provider's default temperature.""" + return _restricts_temperature(model) or _moonshot_rejects_custom_temperature( + provider, model + ) + + # Anthropic removed the sampling parameters (temperature, top_p, top_k) starting # with Claude Opus 4.7. On Opus 4.7 and later, sending `temperature` at all — # even 0.0 — returns HTTP 400. Earlier Claude models (Opus 4.6 and below, every @@ -1404,7 +1428,7 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL "messages": messages_copy, "temperature": temperature, } - if _restricts_temperature(model): + if _omit_temperature(provider, model): payload.pop("temperature", None) if max_tokens and max_tokens > 0: tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens" @@ -1598,7 +1622,7 @@ async def llm_call_async( "messages": messages_copy, "temperature": temperature, } - if _restricts_temperature(model): + if _omit_temperature(provider, model): payload.pop("temperature", None) if max_tokens and max_tokens > 0: tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens" @@ -1715,7 +1739,7 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl "temperature": temperature, "stream": True, } - if _restricts_temperature(model): + if _omit_temperature(provider, model): payload.pop("temperature", None) if provider not in {"openrouter", "groq"}: payload["stream_options"] = {"include_usage": True} diff --git a/tests/test_llm_core_temperature.py b/tests/test_llm_core_temperature.py index 685313011..ab6334f36 100644 --- a/tests/test_llm_core_temperature.py +++ b/tests/test_llm_core_temperature.py @@ -29,7 +29,12 @@ def test_normal_models_allow_temperature(model): assert llm_core._restricts_temperature(model) is False -def _capture_openai_payload(monkeypatch, model, temperature): +def _capture_openai_payload( + monkeypatch, + model, + temperature, + url="https://api.openai.com/v1/chat/completions", +): """Run a synchronous OpenAI-compatible call and return the posted JSON body.""" llm_core._response_cache.clear() seen = {} @@ -45,7 +50,7 @@ def _capture_openai_payload(monkeypatch, model, temperature): monkeypatch.setattr(llm_core.httpx, "post", fake_post) result = llm_core.llm_call( - "https://api.openai.com/v1/chat/completions", + url, model, [{"role": "user", "content": "Say OK"}], temperature=temperature, @@ -131,3 +136,61 @@ def test_anthropic_payload_clamps_negative(): def test_anthropic_payload_none_temperature_does_not_crash(): payload = _anthropic_payload(None) assert payload["temperature"] is None + + +@pytest.mark.parametrize( + "model", + [ + "kimi-k2.5", + "kimi-k2.6", + "moonshot/kimi-k2.6", + "kimi-k2.6-preview", + ], +) +def test_moonshot_k2_5_plus_uses_fixed_temperature(model): + assert llm_core._moonshot_rejects_custom_temperature("moonshot", model) + + +@pytest.mark.parametrize( + "provider,model", + [ + ("openai", "kimi-k2.6"), + ("moonshot", "kimi-k2-0905-preview"), + ("moonshot", "kimi-k2-thinking"), + ("moonshot", "kimi-k2.50"), + ("moonshot", None), + ], +) +def test_other_models_keep_temperature(provider, model): + assert not llm_core._moonshot_rejects_custom_temperature(provider, model) + + +@pytest.mark.parametrize( + "url", + [ + "https://api.moonshot.ai/v1/chat/completions", + "https://api.moonshot.cn/v1/chat/completions", + ], +) +def test_moonshot_provider_detection(url): + assert llm_core._detect_provider(url) == "moonshot" + + +def test_moonshot_k2_6_payload_omits_temperature(monkeypatch): + payload = _capture_openai_payload( + monkeypatch, + "kimi-k2.6", + 0.7, + url="https://api.moonshot.ai/v1/chat/completions", + ) + assert "temperature" not in payload + + +def test_self_hosted_kimi_k2_6_payload_keeps_temperature(monkeypatch): + payload = _capture_openai_payload( + monkeypatch, + "kimi-k2.6", + 0.7, + url="http://localhost:8000/v1/chat/completions", + ) + assert payload["temperature"] == 0.7