From ff0f1b3450e73d4067a41d8e2b6446c05ea00a3f Mon Sep 17 00:00:00 2001 From: tanmayraut45 Date: Sun, 28 Jun 2026 05:48:17 +0530 Subject: [PATCH] fix(mcp): retain builtin startup tasks and reap npx probe Keep strong references to builtin MCP startup tasks until completion and kill/reap the npx probe subprocess when cancellation interrupts the probe. Includes focused regression coverage for both lifecycle paths. --- src/builtin_mcp.py | 28 +++++++- tests/test_builtin_mcp_bg_tasks.py | 108 +++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 tests/test_builtin_mcp_bg_tasks.py diff --git a/src/builtin_mcp.py b/src/builtin_mcp.py index 93ef0ee61..2a4b748ee 100644 --- a/src/builtin_mcp.py +++ b/src/builtin_mcp.py @@ -89,6 +89,21 @@ _BUILTIN_NPX_SERVERS = { MCP_DISABLED = os.environ.get("ODYSSEUS_DISABLE_MCP", "").lower() in ("1", "true", "yes") +# Strong references to the fire-and-forget startup tasks scheduled below. +# asyncio only keeps weak references to tasks created via create_task, so +# without this the GC can collect a task mid-execution and the server +# registration silently never runs. Mirrors _spawn_bg in routes/chat_helpers.py. +_BG_TASKS: set[asyncio.Task] = set() + + +def _spawn_bg(coro) -> asyncio.Task: + """Schedule a background task and hold a strong reference until it finishes.""" + task = asyncio.create_task(coro) + _BG_TASKS.add(task) + task.add_done_callback(_BG_TASKS.discard) + return task + + async def register_builtin_servers(mcp_manager): """Connect all built-in MCP servers to the manager.""" if MCP_DISABLED: @@ -123,7 +138,7 @@ async def register_builtin_servers(mcp_manager): if not os.path.exists(script_path): logger.warning(f"Built-in MCP server script not found: {script_path}") continue - asyncio.create_task(_connect_python_server(server_id, script_path, name)) + _spawn_bg(_connect_python_server(server_id, script_path, name)) # Register NPX-based servers in the background (they take longer to start) npx_path = _find_npx() @@ -175,7 +190,7 @@ async def register_builtin_servers(mcp_manager): except BaseException as e: logger.warning(f"Built-in NPX server {cfg['name']} error: {type(e).__name__}: {e}") - asyncio.create_task(_start_npx_servers()) + _spawn_bg(_start_npx_servers()) def _npx_package_from_args(args): @@ -233,6 +248,15 @@ async def _is_npx_package_cached(npx_path, package_spec, timeout_s=5): except Exception: pass return False + except asyncio.CancelledError: + # The probe was cancelled (e.g. app shutdown). Reap the child so it + # isn't orphaned, then propagate the cancellation. + try: + proc.kill() + await proc.wait() + except Exception: + pass + raise return proc.returncode == 0 and bool(stdout.strip()) diff --git a/tests/test_builtin_mcp_bg_tasks.py b/tests/test_builtin_mcp_bg_tasks.py new file mode 100644 index 000000000..908c1f339 --- /dev/null +++ b/tests/test_builtin_mcp_bg_tasks.py @@ -0,0 +1,108 @@ +"""Issue #4592 — built-in MCP startup must not leak tasks or subprocesses. + +Two defects in src/builtin_mcp.py: + * `register_builtin_servers` scheduled its python/npx connect coroutines with + a bare `asyncio.create_task(...)` whose return value was dropped. asyncio + keeps only a weak reference to such tasks, so the GC can collect one + mid-flight and the server silently never registers. + * `_is_npx_package_cached` killed its `npx --version` probe subprocess on + `TimeoutError` but not on `CancelledError`, so a cancellation (e.g. app + shutdown) orphaned the child. + +Both are exercised here with the module loaded in isolation (the same loader +the existing npx-cache tests use), so no real servers or npx are involved. +""" + +import asyncio +import importlib.util +import sys +import types +from pathlib import Path + +import pytest + + +ROOT = Path(__file__).resolve().parent.parent + + +def _load_builtin_mcp(monkeypatch): + core = types.ModuleType("core") + core.__path__ = [] + platform_compat = types.ModuleType("core.platform_compat") + platform_compat.IS_WINDOWS = False + platform_compat.which_tool = lambda name: None + monkeypatch.setitem(sys.modules, "core", core) + monkeypatch.setitem(sys.modules, "core.platform_compat", platform_compat) + + spec = importlib.util.spec_from_file_location( + "builtin_mcp_under_test", + ROOT / "src" / "builtin_mcp.py", + ) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +async def test_spawn_bg_holds_strong_ref_until_task_finishes(monkeypatch): + builtin_mcp = _load_builtin_mcp(monkeypatch) + + started = asyncio.Event() + release = asyncio.Event() + + async def work(): + started.set() + await release.wait() + + task = builtin_mcp._spawn_bg(work()) + await started.wait() + # While the task is in flight it must be reachable from the module-level + # set — that strong reference is what keeps the GC from collecting it. + assert task in builtin_mcp._BG_TASKS + + release.set() + await task + await asyncio.sleep(0) # let the done-callback run + # Once finished it is discarded so the set doesn't grow without bound. + assert task not in builtin_mcp._BG_TASKS + + +async def test_npx_probe_reaps_subprocess_on_cancel(monkeypatch): + builtin_mcp = _load_builtin_mcp(monkeypatch) + + # Force the code past the fast cache hit so it spawns the probe subprocess. + monkeypatch.setattr(builtin_mcp, "_is_package_in_npx_cache", lambda spec: False) + + state = {"killed": False, "waited": False} + started = asyncio.Event() + + class FakeProc: + returncode = None + + async def communicate(self): + started.set() + await asyncio.sleep(3600) # block until the probe is cancelled + + def kill(self): + state["killed"] = True + + async def wait(self): + state["waited"] = True + + async def fake_create(*args, **kwargs): + return FakeProc() + + monkeypatch.setattr(builtin_mcp.asyncio, "create_subprocess_exec", fake_create) + + task = asyncio.create_task( + builtin_mcp._is_npx_package_cached("npx", "some-pkg@1.0.0", timeout_s=3600) + ) + await started.wait() + task.cancel() + + with pytest.raises(asyncio.CancelledError): + await task + + # The child was killed and reaped rather than orphaned. + assert state["killed"] is True + assert state["waited"] is True