fix(llm): normalize OpenAI-compatible chat URLs

Normalize OpenAI-compatible chat URLs so that base /v1 endpoints route to /v1/chat/completions, while URLs that already name a full chat endpoint are left unchanged (normalization is idempotent). Preserve native local Ollama routing for bare localhost:11434 endpoints, keep localhost:11434/v1 as OpenAI-compatible, and add focused regression coverage for provider detection, chat target URLs, and model listing from /v1. Part of #541.
2026-06-28 23:52:09 -04:00 · 2026-06-28 16:30:15 +02:00
parent bb2148db73
commit 927b1f7ecf
2 changed files with 97 additions and 3 deletions
@@ -345,6 +345,18 @@ def _normalize_ollama_url(url: str) -> str:
    return base.rstrip("/") + "/chat"
 def _normalize_openai_chat_url(url: str) -> str:
    """Ensure an OpenAI-compatible base URL points at /chat/completions."""
    base = (url or "").strip().rstrip("/")
    if not base:
        return base
    if base.endswith("/chat/completions") or base.endswith("/completions"):
        return base
    if base.endswith("/models"):
        base = base[: -len("/models")].rstrip("/")
    return base + "/chat/completions"
 def _ollama_normalize_messages(messages: List[Dict]) -> List[Dict]:
    """Adapt Odysseus' canonical OpenAI-style messages to native Ollama /api/chat.
@@ -1563,7 +1575,7 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
            stream=False, num_ctx=get_context_length(url, model),
        )
    else:
-        target_url = url
+        target_url = _normalize_openai_chat_url(url)
        if provider == "copilot":
            from src.copilot import apply_request_headers
            apply_request_headers(h, messages_copy)
@@ -1767,7 +1779,7 @@ async def llm_call_async(
            stream=False, num_ctx=get_context_length(url, model),
        )
    else:
-        target_url = url
+        target_url = _normalize_openai_chat_url(url)
        h = _provider_headers(provider, headers)
        if provider == "copilot":
            from src.copilot import apply_request_headers
@@ -1889,7 +1901,7 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
        h = _provider_headers(provider, headers)
        payload = _build_chatgpt_responses_payload(model, messages_copy, temperature, max_tokens, stream=True)
    else:
-        target_url = url
+        target_url = _normalize_openai_chat_url(url)
        payload = {
            "model": model,
            "messages": messages_copy,
@@ -9,6 +9,12 @@ def test_detects_ollama_cloud_native_provider():
    assert llm_core._detect_provider("https://ollama.com/api/chat") == "ollama"
 def test_detects_bare_local_ollama_as_native_provider():
    """Bare local :11434 URLs detect as "ollama"; the /v1 path detects as "openai"."""
    expected_by_endpoint = (
        ("http://localhost:11434", "ollama"),
        ("http://127.0.0.1:11434/", "ollama"),
        ("http://localhost:11434/v1", "openai"),
    )
    for endpoint, provider in expected_by_endpoint:
        assert llm_core._detect_provider(endpoint) == provider
 def test_llm_call_posts_native_ollama_payload(monkeypatch):
    seen = {}
@@ -43,6 +49,82 @@ def test_llm_call_posts_native_ollama_payload(monkeypatch):
    assert seen["json"]["options"] == {"temperature": 0.2, "num_predict": 7}
 def test_llm_call_posts_bare_local_ollama_to_native_api(monkeypatch):
    """A bare http://localhost:11434 URL is POSTed to /api/chat with stream=False."""
    captured = {}

    def fake_post(url, headers=None, json=None, timeout=None):
        # Record what llm_call sent, then answer with a canned chat reply.
        captured.update(url=url, json=json)
        body = {"message": {"content": "OK"}, "done": True}
        return httpx.Response(200, request=httpx.Request("POST", url), json=body)

    monkeypatch.setattr(llm_core.httpx, "post", fake_post)

    conversation = [{"role": "user", "content": "Say OK"}]
    reply = llm_core.llm_call("http://localhost:11434", "llama3.2", conversation)

    assert reply == "OK"
    assert captured["url"] == "http://localhost:11434/api/chat"
    assert captured["json"]["stream"] is False
 def test_openai_compatible_chat_url_shapes(monkeypatch):
    """Both the /v1 base and the full chat URL are POSTed to /v1/chat/completions."""
    posted_urls = []

    def fake_post(url, headers=None, json=None, timeout=None):
        # Remember the target URL and reply with a minimal chat-completions body.
        posted_urls.append(url)
        return httpx.Response(
            200,
            request=httpx.Request("POST", url),
            json={"choices": [{"message": {"content": "OK"}}]},
        )

    monkeypatch.setattr(llm_core.httpx, "post", fake_post)
    # NOTE(review): presumably cleared so a cached reply cannot bypass
    # fake_post -- confirm against llm_call's caching behaviour.
    llm_core._response_cache.clear()

    full_chat_url = "http://localhost:11434/v1/chat/completions"
    # Every input shape below must resolve to the same full chat endpoint.
    input_urls = ("http://localhost:11434/v1", full_chat_url)

    for idx, base_url in enumerate(input_urls):
        # Model name and prompt differ per case (presumably to avoid cache hits).
        prompt = [{"role": "user", "content": f"Say OK {idx}"}]
        reply = llm_core.llm_call(base_url, f"openai-compatible-{idx}", prompt)
        assert reply == "OK"
        assert posted_urls[-1] == full_chat_url
 def test_list_model_ids_from_openai_compatible_v1(monkeypatch):
    """Listing models from a /v1 base GETs /v1/models and yields the returned id."""
    captured = {}

    def fake_get(url, headers=None, timeout=None):
        # Record the requested URL and return a one-model listing.
        captured["url"] = url
        listing = {"data": [{"id": "qwen2.5-coder:7b"}]}
        return httpx.Response(200, request=httpx.Request("GET", url), json=listing)

    monkeypatch.setattr(llm_core.httpx, "get", fake_get)

    model_ids = llm_core.list_model_ids("http://localhost:11434/v1")

    assert model_ids == ["qwen2.5-coder:7b"]
    assert captured["url"] == "http://localhost:11434/v1/models"
 # ---------------------------------------------------------------------------
 # Tool-call argument serialization for native Ollama
 #