From bd9149f79a3777d5211c3afbc707d94890faba5a Mon Sep 17 00:00:00 2001
From: aubrey <kyuhex@gmail.com>
Date: Tue, 23 Jun 2026 05:28:17 -0300
Subject: [PATCH] fix(llm): detect mistral.ai provider and support
 reasoning_effort (#4698)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(llm): detect mistral.ai provider and support reasoning_effort

Four coupled bugs broke Mistral thinking model support:

1. _detect_provider() had no mistral.ai host check, so all Mistral
   endpoints fell through to the generic 'openai' provider string.
   _provider_display_name() correctly identified them as 'Mistral',
   making any 'if provider == "Mistral"' check elsewhere dead code.

2. reasoning_effort parameter was never sent in the request payload,
   so Mistral never activated thinking mode even when the user
   configured a thinking-capable model (mistral-small-latest,
   mistral-medium-latest, magistral-*).

3. Mistral returns content as a typed array
   ([{"type":"thinking",...},{"type":"text",...}]) when
   reasoning is on, not as a plain string. Both the streaming and
   non-streaming parsers expected strings and silently dropped the
   thinking content.

4. _THINKING_MODEL_PATTERNS didn't include magistral or mistral-*
   model prefixes, so the frontend wouldn't tag reasoning output
   as thinking even after the above were fixed.

Fix:
- Add mistral.ai to _detect_provider() host checks
- Add a _normalize_mistral_content() helper that splits the typed
  array into (text, thinking) strings
- Inject payload["reasoning_effort"] = "high" when provider is
  Mistral and _supports_thinking(model) is true, in both stream_llm
  and llm_call_async payload construction
- Wire the normalizer into both response parsers
- Extend _THINKING_MODEL_PATTERNS to include magistral,
  mistral-small, mistral-medium, mistral-large

Tested on Docker install with mistral-small-latest +
reasoning_effort=high. Reasoning streams correctly into the
thinking panel after the fix.

Fixes #4678

* fix(llm): address review — lowercase provider id, configurable effort, tests

Addresses vdmkenny's review on PR #4698:

1. Removed duplicate 'if provider == "mistral"' block in stream_llm
   — two back-to-back copies, one was dead-redundant.

2. Dropped personal-context comment ('free-tier limits are generous
   for this user') and made reasoning_effort configurable via env var
   ODYSSEUS_MISTRAL_REASONING_EFFORT (high / medium / low / none).
   Default remains 'high' for backward compat with the tested behavior.

3. Recased provider id from 'Mistral' to 'mistral' to match the
   lowercase convention used by every other provider id in the file
   (openai, anthropic, ollama, copilot, ...). _provider_display_name()
   still returns the Title-Case 'Mistral' for UI labels — only the
   runtime id used in 'if provider == ...' checks was recased.

4. Added tests/test_llm_core_mistral_content.py with 13 tests pinning
   _normalize_mistral_content()'s contract: string passthrough, the
   Mistral array format (thinking + text blocks), and edge cases
   (empty, garbage, None, wrong types, missing fields, string-vs-array
   inner thinking field).

Also fixed a gap the review didn't catch: the non-streaming paths
(llm_call sync + llm_call_async) were missing the reasoning_effort
injection entirely. Added the same injection to both, so Deep Research
and agent tool calls also activate Mistral thinking.

All 13 new tests pass. Existing reasoning/streaming/ollama-thinking
tests still pass (38 tests, no regressions).

Fixes #4678
---
 src/llm_core.py                        |  72 +++++++++++-
 tests/test_llm_core_mistral_content.py | 156 +++++++++++++++++++++++++
 2 files changed, 226 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_llm_core_mistral_content.py

diff --git a/src/llm_core.py b/src/llm_core.py
index 20a4b544c..2178f894f 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -618,6 +618,8 @@ def _detect_provider(url: str) -> str:
     from src.copilot import is_copilot_base
     if is_copilot_base(url):
         return "copilot"
+    if _host_match(url, "mistral.ai"):
+        return "mistral"
     return "openai"
 
 
@@ -906,10 +908,17 @@ def _anthropic_rejects_temperature(model: str) -> bool:
         return False
     return (int(match.group(1)), int(match.group(2))) >= (4, 7)
 
+# Reasoning effort level sent to Mistral thinking-capable models. Mistral's
+# API accepts "high", "medium", "low", "none" — see
+# https://docs.mistral.ai/capabilities/reasoning/. Override via env var
+# ODYSSEUS_MISTRAL_REASONING_EFFORT (e.g. set to "medium" for cheaper chat).
+_MISTRAL_REASONING_EFFORT = os.getenv("ODYSSEUS_MISTRAL_REASONING_EFFORT", "high")
+
 # Models that support structured thinking — may output </think> without opening tag
 _THINKING_MODEL_PATTERNS = (
     "qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax",
     "m2-reap", "gemma", "stepfun", "step-3", "step3",
+    "magistral", "mistral-small", "mistral-medium",
 )
 
 def _supports_thinking(model: str) -> bool:
@@ -919,6 +928,38 @@ def _supports_thinking(model: str) -> bool:
     m = model.lower()
     return any(p in m for p in _THINKING_MODEL_PATTERNS)
 
+def _normalize_mistral_content(content):
+    """Mistral returns content as a structured array when reasoning is on:
+        [{"type": "thinking", "thinking": [{"type": "text", "text": "..."}], "closed": true},
+         {"type": "text", "text": "...final answer..."}]
+    Convert to (text, thinking) tuple of plain strings. Pass through strings
+    unchanged so non-Mistral OpenAI-compat endpoints are unaffected.
+    """
+    if isinstance(content, str):
+        return content, ""
+    if not isinstance(content, list):
+        return "", ""
+    text_parts = []
+    thinking_parts = []
+    for block in content:
+        if not isinstance(block, dict):
+            continue
+        btype = block.get("type")
+        if btype == "text":
+            t = block.get("text", "")
+            if t:
+                text_parts.append(t)
+        elif btype == "thinking":
+            inner = block.get("thinking", [])
+            if isinstance(inner, list):
+                for tb in inner:
+                    if isinstance(tb, dict) and tb.get("text"):
+                        thinking_parts.append(tb["text"])
+            elif isinstance(inner, str):
+                thinking_parts.append(inner)
+    return "".join(text_parts), "".join(thinking_parts)
+
+
 def _convert_openai_content_to_anthropic(content):
     """Convert OpenAI multimodal content blocks to Anthropic format.
 
@@ -1441,6 +1482,8 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
         if max_tokens and max_tokens > 0:
             tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
             payload[tok_key] = max_tokens
+        if provider == "mistral" and _supports_thinking(model):
+            payload["reasoning_effort"] = _MISTRAL_REASONING_EFFORT
     try:
         note_model_activity(target_url, model)
         r = httpx_post_kimi_aware(target_url, h, json=payload, timeout=timeout)
@@ -1456,7 +1499,16 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
             response = _parse_ollama_response(data)
         else:
             msg = data["choices"][0]["message"]
-            response = msg.get("content") or msg.get("reasoning_content") or ""
+            content = msg.get("content")
+            if isinstance(content, list):
+                # Mistral structured content — extract thinking + text
+                text_part, thinking_part = _normalize_mistral_content(content)
+                if thinking_part:
+                    response = thinking_part + "\n\n" + (text_part or "")
+                else:
+                    response = text_part or msg.get("reasoning_content") or ""
+            else:
+                response = content or msg.get("reasoning_content") or ""
         _set_cached_response(cache_key, response)
         return response
     except Exception:
@@ -1638,6 +1690,8 @@ async def llm_call_async(
         # Suppress thinking for qwen3/gemma4 on Ollama /v1 — same as stream_llm.
         if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
             payload["think"] = False
+        if provider == "mistral" and _supports_thinking(model):
+            payload["reasoning_effort"] = _MISTRAL_REASONING_EFFORT
         _apply_local_cache_affinity(payload, url, session_id)
 
     if _is_host_dead(target_url):
@@ -1756,6 +1810,12 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
             payload[tok_key] = max_tokens
         if tools:
             payload["tools"] = tools
+        # Mistral thinking-capable models — send reasoning_effort so Mistral
+        # activates thinking mode and returns structured reasoning_content.
+        # Effort level is configurable via ODYSSEUS_MISTRAL_REASONING_EFFORT
+        # (high / medium / low / none); default "high".
+        if provider == "mistral" and _supports_thinking(model):
+            payload["reasoning_effort"] = _MISTRAL_REASONING_EFFORT
         # For Ollama's OpenAI-compat /v1 endpoint with thinking models (qwen3,
         # gemma4, etc.), suppress thinking so tool calls aren't swallowed inside
         # <think> blocks. Ollama /v1 accepts "think": false as a top-level param.
@@ -2134,9 +2194,17 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
                                         # Text content
                                         # Reasoning tokens (VLLM --reasoning-parser, e.g. Qwen3/DeepSeek-R1, Nemotron). vLLM 0.20.2 / NIM emit the field as `reasoning`; older builds use `reasoning_content`. Some OpenAI-compatible Ollama builds use `thinking`.
                                         reasoning = delta.get("reasoning_content") or delta.get("reasoning") or delta.get("thinking") or ""
+                                        content = delta.get("content") or ""
+                                        # Mistral structured content: content is a list of typed blocks
+                                        # ({"type": "thinking", ...}, {"type": "text", ...}). Split into
+                                        # reasoning + text so thinking streams into the thinking panel.
+                                        if isinstance(content, list):
+                                            text_part, thinking_part = _normalize_mistral_content(content)
+                                            if thinking_part:
+                                                reasoning = (reasoning + thinking_part) if reasoning else thinking_part
+                                            content = text_part
                                         if reasoning:
                                             yield _stream_delta_event(reasoning, thinking=True)
-                                        content = delta.get("content") or ""
                                         if content:
                                             content = re.sub(r"<mm:think(\s+[^>]*)?>", r"<think\1>", content, flags=re.IGNORECASE)
                                             content = re.sub(r"</mm:think>", "</think>", content, flags=re.IGNORECASE)
diff --git a/tests/test_llm_core_mistral_content.py b/tests/test_llm_core_mistral_content.py
new file mode 100644
index 000000000..ee09011b5
--- /dev/null
+++ b/tests/test_llm_core_mistral_content.py
@@ -0,0 +1,156 @@
+"""Tests for _normalize_mistral_content() — Mistral's structured content parser.
+
+Mistral's chat completions API returns content as a typed array when reasoning
+is enabled, instead of the plain string most OpenAI-compat servers use:
+
+    "content": [
+      {"type": "thinking", "thinking": [{"type": "text", "text": "..."}], "closed": true},
+      {"type": "text", "text": "..."}
+    ]
+
+_normalize_mistral_content() splits that into (text, thinking) plain strings.
+The function is called from three sites:
+  - llm_call (sync, non-streaming response parser)
+  - llm_call_async (async, non-streaming response parser)
+  - stream_llm (streaming delta parser)
+
+These tests pin the contract: string passthrough, the array shape, and the
+edge cases (empty, garbage, missing fields) so a refactor doesn't silently
+drop thinking content or break non-Mistral providers.
+"""
+from src.llm_core import _normalize_mistral_content
+
+
+def test_string_passthrough_returns_text_with_empty_thinking():
+    """Plain string content (the common case) passes through unchanged."""
+    text, thinking = _normalize_mistral_content("hello world")
+    assert text == "hello world"
+    assert thinking == ""
+
+
+def test_empty_string_passthrough():
+    text, thinking = _normalize_mistral_content("")
+    assert text == ""
+    assert thinking == ""
+
+
+def test_array_with_thinking_and_text_blocks():
+    """Mistral's documented format: thinking block + text block."""
+    content = [
+        {
+            "type": "thinking",
+            "thinking": [{"type": "text", "text": "Let me work through this..."}],
+            "closed": True,
+        },
+        {"type": "text", "text": "The answer is 42."},
+    ]
+    text, thinking = _normalize_mistral_content(content)
+    assert text == "The answer is 42."
+    assert thinking == "Let me work through this..."
+
+
+def test_array_with_only_thinking_block():
+    """Streaming deltas often contain only a thinking fragment (no text block yet)."""
+    content = [
+        {
+            "type": "thinking",
+            "thinking": [{"type": "text", "text": "Okay, let's"}],
+            "closed": True,
+        }
+    ]
+    text, thinking = _normalize_mistral_content(content)
+    assert text == ""
+    assert thinking == "Okay, let's"
+
+
+def test_array_with_only_text_block():
+    """Final answer delta — only the text block, no thinking."""
+    content = [{"type": "text", "text": "Final answer."}]
+    text, thinking = _normalize_mistral_content(content)
+    assert text == "Final answer."
+    assert thinking == ""
+
+
+def test_array_concatenates_multiple_text_blocks():
+    """Multiple text blocks are concatenated in order."""
+    content = [
+        {"type": "text", "text": "part 1 "},
+        {"type": "text", "text": "part 2"},
+    ]
+    text, thinking = _normalize_mistral_content(content)
+    assert text == "part 1 part 2"
+
+
+def test_array_concatenates_multiple_thinking_fragments():
+    """Multiple thinking sub-blocks are concatenated in order."""
+    content = [
+        {
+            "type": "thinking",
+            "thinking": [
+                {"type": "text", "text": "first "},
+                {"type": "text", "text": "second"},
+            ],
+            "closed": True,
+        }
+    ]
+    text, thinking = _normalize_mistral_content(content)
+    assert text == ""
+    assert thinking == "first second"
+
+
+def test_empty_array_returns_empty_strings():
+    text, thinking = _normalize_mistral_content([])
+    assert text == ""
+    assert thinking == ""
+
+
+def test_array_with_garbage_entries_skips_them():
+    """Non-dict entries, missing type, missing text — all silently skipped."""
+    content = [
+        "not a dict",
+        None,
+        {"type": "unknown_type", "text": "should be ignored"},
+        {"type": "text"},  # missing text key
+        {"type": "thinking"},  # missing thinking key
+        {"type": "text", "text": "valid text"},
+    ]
+    text, thinking = _normalize_mistral_content(content)
+    assert text == "valid text"
+    assert thinking == ""
+
+
+def test_none_returns_empty_strings():
+    """Defensive: None content (server bug or schema drift) doesn't crash."""
+    text, thinking = _normalize_mistral_content(None)
+    assert text == ""
+    assert thinking == ""
+
+
+def test_int_returns_empty_strings():
+    """Defensive: wrong-typed content doesn't crash."""
+    text, thinking = _normalize_mistral_content(42)
+    assert text == ""
+    assert thinking == ""
+
+
+def test_thinking_block_with_string_inner():
+    """Some Mistral API versions may use a string instead of an array for
+    the inner 'thinking' field. Accept both shapes."""
+    content = [
+        {"type": "thinking", "thinking": "inline string thinking"},
+        {"type": "text", "text": "answer"},
+    ]
+    text, thinking = _normalize_mistral_content(content)
+    assert text == "answer"
+    assert thinking == "inline string thinking"
+
+
+def test_thinking_block_with_empty_text_field():
+    """Empty text fields don't pollute the output."""
+    content = [
+        {"type": "thinking", "thinking": [{"type": "text", "text": ""}], "closed": True},
+        {"type": "text", "text": ""},
+    ]
+    text, thinking = _normalize_mistral_content(content)
+    assert text == ""
+    assert thinking == ""