fix(llm): omit temperature for Kimi K2.5 and K2.6 (#3960)

This commit is contained in:
Josh Patra
2026-06-15 07:29:22 -04:00
committed by GitHub
parent 4ee5ed4dce
commit f5d3e5098a
2 changed files with 92 additions and 5 deletions
+27 -3
View File
@@ -605,6 +605,8 @@ def _detect_provider(url: str) -> str:
return "groq"
if _host_match(url, "nvidia.com"):
return "nvidia"
if _host_match(url, "moonshot.ai") or _host_match(url, "moonshot.cn"):
return "moonshot"
from src.chatgpt_subscription import is_chatgpt_subscription_base
if is_chatgpt_subscription_base(url):
return "chatgpt-subscription"
@@ -856,6 +858,28 @@ def _restricts_temperature(model: str) -> bool:
m = model.lower()
return any(m.startswith(p) or f"/{p}" in m for p in _FIXED_TEMPERATURE_MODELS)
# The official Moonshot API fixes temperature at 1.0 in thinking mode and 0.6
# when thinking is explicitly disabled for Kimi K2.5/K2.6. Any other explicit
# value returns HTTP 400. Odysseus does not currently send the `thinking` mode
# control, so omit temperature and let Moonshot use its default thinking mode.
# Keep the gate provider-specific: self-hosted Kimi deployments may accept
# custom sampling values, and older Moonshot models have different defaults.
def _moonshot_rejects_custom_temperature(provider: str, model: str) -> bool:
"""Check if the official Moonshot API fixes temperature for this model."""
if provider != "moonshot" or not isinstance(model, str):
return False
model_id = model.lower().rsplit("/", 1)[-1]
return bool(re.match(r"^kimi-k2\.(?:5|6)(?:$|[-_:])", model_id))
def _omit_temperature(provider: str, model: str) -> bool:
"""Check if a request should use the provider's default temperature."""
return _restricts_temperature(model) or _moonshot_rejects_custom_temperature(
provider, model
)
# Anthropic removed the sampling parameters (temperature, top_p, top_k) starting
# with Claude Opus 4.7. On Opus 4.7 and later, sending `temperature` at all —
# even 0.0 — returns HTTP 400. Earlier Claude models (Opus 4.6 and below, every
@@ -1404,7 +1428,7 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
"messages": messages_copy,
"temperature": temperature,
}
if _restricts_temperature(model):
if _omit_temperature(provider, model):
payload.pop("temperature", None)
if max_tokens and max_tokens > 0:
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
@@ -1598,7 +1622,7 @@ async def llm_call_async(
"messages": messages_copy,
"temperature": temperature,
}
if _restricts_temperature(model):
if _omit_temperature(provider, model):
payload.pop("temperature", None)
if max_tokens and max_tokens > 0:
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
@@ -1715,7 +1739,7 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
"temperature": temperature,
"stream": True,
}
if _restricts_temperature(model):
if _omit_temperature(provider, model):
payload.pop("temperature", None)
if provider not in {"openrouter", "groq"}:
payload["stream_options"] = {"include_usage": True}