feat(discovery): detect llama.cpp servers and label local providers (#4729)

* feat(discovery): detect llama.cpp servers and label local providers

  Scan port 8080 (llama-server) and 11435 (APFEL) during discovery, fingerprint llama.cpp via its native /props endpoint, and label well-known local serving ports (8080 llama.cpp, 8000 vLLM, 1234 LM Studio, 11434 Ollama) consistently in both the Python provider helper and the JS endpoint UI. Adds a llama.cpp hint to the /setup slash command.

* fix(discovery): don't infer the serving tool from the port alone

  Per review: vLLM, SGLang, llama.cpp and plain OpenAI-compatible servers all share 8000/8080, so labeling by port mislabels real setups (a vLLM box on 8080 shown as llama.cpp). Drop the port->tool assertions from _provider_label and providerLabel; the authoritative signal is the /props fingerprint done during discovery, which is unchanged. Loopback now reads a neutral 'local endpoint' / 'Local'. Tests updated to assert the neutral labels.
2026-06-28 23:52:09 -04:00 · 2026-06-23 23:39:56 +02:00
parent 72c0bde8a9
commit e0ccf250a4
9 changed files with 330 additions and 15 deletions
@@ -777,10 +777,17 @@ def _provider_label(url: str) -> str:
            pass
    if _is_ollama_native_url(url): return "Ollama"
    try:
-        host = (urlparse(url).hostname or "").lower()
+        _parsed_local = urlparse(url)
+        host = (_parsed_local.hostname or "").lower()
+        port = _parsed_local.port
    except Exception:
        return "provider"
    if host in {"localhost", "127.0.0.1", "::1", "0.0.0.0"}:
+        # A port alone is not authoritative: vLLM, SGLang, llama.cpp and plain
+        # OpenAI-compatible servers all routinely share 8000/8080, so naming the
+        # serving tool from the port here would mislabel real setups. The tool is
+        # identified by probing llama-server's native /props endpoint during
+        # discovery (see ModelDiscovery._fingerprint_provider); this stays neutral.
        return "local endpoint"
    return host or "provider"

@@ -163,6 +163,21 @@ class ModelDiscovery:
                    return "lmstudio"
        except Exception:
            pass
+        # llama.cpp's llama-server exposes a native /props endpoint (no /v1 prefix)
+        # describing the loaded model, slots, and chat template — distinct from
+        # LM Studio (/api/v1/models) and vLLM (/version, /metrics).
+        try:
+            r = httpx.get(f"http://{host}:{port}/props", timeout=1.5)
+            if r.is_success:
+                props = r.json() or {}
+                if isinstance(props, dict) and (
+                    "default_generation_settings" in props
+                    or "total_slots" in props
+                    or "chat_template" in props
+                ):
+                    return "llamacpp"
+        except Exception:
+            pass
        return None

    def _check_port(self, host: str, port: int) -> Optional[Dict[str, Any]]:
@@ -194,10 +209,11 @@ class ModelDiscovery:

        logger.info(f"Scanning {len(hosts)} hosts for models: {hosts}")

-        # Well-known ports: 8000-8020 (vLLM, llama.cpp, SGLang, Cookbook),
-        # 1234 (LM Studio), 11434 (Ollama), 11435 for APFEL as its default port is
-        # occupied by Ollama. The env vars can add more ports which will be merged in.
-        ports = list(range(8000, 8021)) + [1234, 11434, 11435]
+        # Well-known ports: 8000-8020 (vLLM, SGLang, Cookbook), 8080 (llama.cpp /
+        # llama-server default), 1234 (LM Studio), 11434 (Ollama), and 11435
+        # (APFEL, whose own default port is already taken by Ollama). Extra ports
+        # supplied via env vars are merged in below.
+        ports = list(range(8000, 8021)) + [8080, 1234, 11434, 11435]
        ports += [p for p in sorted(self._extra_ports) if p not in ports]
        targets = [(h, p) for h in hosts for p in ports]