From e7c1d758846264458064e98317b681ab3f10c0f2 Mon Sep 17 00:00:00 2001 From: Ocean Bennett <204957658+undergroundrap@users.noreply.github.com> Date: Mon, 8 Jun 2026 19:09:02 -0400 Subject: [PATCH] fix(models): query v1 models for llama-server endpoints (#3380) * fix(models): query v1 models for llama-server endpoints * test(models): accept owner kwargs in llama-server regression --- routes/model_routes.py | 2 +- src/endpoint_resolver.py | 2 +- src/llm_core.py | 4 +- src/model_context.py | 4 +- tests/test_llama_server_models_url.py | 58 +++++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 tests/test_llama_server_models_url.py diff --git a/routes/model_routes.py b/routes/model_routes.py index a54f4d302..995705d75 100644 --- a/routes/model_routes.py +++ b/routes/model_routes.py @@ -857,7 +857,7 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> and 400 <= result["status_code"] < 500 and result["status_code"] not in (401, 403) ): - models_url = base.rstrip("/") + "/models" + models_url = build_models_url(base) try: r2 = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify()) result2 = _result_from_response(r2) diff --git a/src/endpoint_resolver.py b/src/endpoint_resolver.py index 1ae7ace84..0a3063638 100644 --- a/src/endpoint_resolver.py +++ b/src/endpoint_resolver.py @@ -184,7 +184,7 @@ def build_chat_url(base: str) -> str: def build_models_url(base: str) -> Optional[str]: """Return the provider-specific model-list endpoint URL for a base.""" - base = resolve_url(base) + base = normalize_base(resolve_url(base)) provider = _detect_provider(base) if provider == "anthropic": return _anthropic_api_root(base) + "/v1/models" diff --git a/src/llm_core.py b/src/llm_core.py index 2fbfc8178..9ed499c61 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -1042,7 +1042,9 @@ def list_model_ids( if provider == "ollama": models_url = _ollama_api_root(base_chat_url) + "/tags" else: - models_url = base_chat_url.replace("/chat/completions", "/models") + from src.endpoint_resolver import build_models_url + + models_url = build_models_url(base_chat_url) r = httpx.get(models_url, headers=h, timeout=timeout) r.raise_for_status() data = r.json() diff --git a/src/model_context.py b/src/model_context.py index c71d76fcf..a2ce9f638 100644 --- a/src/model_context.py +++ b/src/model_context.py @@ -297,7 +297,9 @@ def _query_context_length(endpoint_url: str, model: str) -> int: logger.info(f"Using known context window for {model}: {known}") return known or DEFAULT_CONTEXT - models_url = endpoint_url.replace("/chat/completions", "/models") + from src.endpoint_resolver import build_models_url + + models_url = build_models_url(endpoint_url) try: r = httpx.get(models_url, timeout=REQUEST_TIMEOUT) if r.is_success: diff --git a/tests/test_llama_server_models_url.py b/tests/test_llama_server_models_url.py new file mode 100644 index 000000000..36c49714a --- /dev/null +++ b/tests/test_llama_server_models_url.py @@ -0,0 +1,58 @@ +"""Regression coverage for llama-server style /v1 model-list endpoints (#3330).""" + +import httpx + +from src import endpoint_resolver, llm_core, model_context + + +def test_build_models_url_accepts_v1_base_and_chat_url(monkeypatch): + monkeypatch.setattr(endpoint_resolver, "resolve_url", lambda url: url) + + assert ( + endpoint_resolver.build_models_url("http://127.0.0.1:8080/v1") + == "http://127.0.0.1:8080/v1/models" + ) + assert ( + endpoint_resolver.build_models_url("http://127.0.0.1:8080/v1/chat/completions") + == "http://127.0.0.1:8080/v1/models" + ) + + +def test_llm_core_list_model_ids_queries_models_for_v1_base(monkeypatch): + monkeypatch.setattr(endpoint_resolver, "resolve_url", lambda url: url) + monkeypatch.setattr(llm_core, "_configured_cached_model_ids", lambda url, **kwargs: []) + seen = [] + + def fake_get(url, headers=None, timeout=None): + seen.append(url) + request = httpx.Request("GET", url) + return httpx.Response(200, json={"data": [{"id": "qwen3"}]}, request=request) + + monkeypatch.setattr(llm_core.httpx, "get", fake_get) + + assert llm_core.list_model_ids("http://127.0.0.1:8080/v1", timeout=1) == ["qwen3"] + assert seen == ["http://127.0.0.1:8080/v1/models"] + + +def test_model_context_queries_models_for_v1_base(monkeypatch): + monkeypatch.setattr(endpoint_resolver, "resolve_url", lambda url: url) + seen = [] + + def fake_get(url, timeout=None): + seen.append(url) + request = httpx.Request("GET", url) + if url.endswith("/slots"): + return httpx.Response(404, request=request) + return httpx.Response( + 200, + json={"data": [{"id": "qwen3", "context_length": 32768}]}, + request=request, + ) + + monkeypatch.setattr(model_context.httpx, "get", fake_get) + + assert model_context._query_context_length("http://127.0.0.1:8080/v1", "qwen3") == 32768 + assert seen == [ + "http://127.0.0.1:8080/slots", + "http://127.0.0.1:8080/v1/models", + ]