From 39a802bea23d56c309109e9f33e6a8cdcecbb65c Mon Sep 17 00:00:00 2001 From: Michael <52305679+michaelxer@users.noreply.github.com> Date: Fri, 19 Jun 2026 03:02:29 +0700 Subject: [PATCH] fix(tools): prune skipped dirs before descending in glob tool (#4538) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(tools): prune skipped dirs before descending in glob tool GlobTool used pathlib.Path.rglob which descends into every directory (including node_modules, .git, dist, etc.) and filters AFTER the walk. On repos with large junk directories this causes the glob tool to hang for minutes. Replace rglob with os.walk that prunes _CODENAV_SKIP_DIRS before descending — matching the approach GrepTool already uses. Also add a fast path for literal patterns (no wildcards → direct path lookup). Fixes #4493 * fix(tools): use regex glob matching to fix * semantics and literal fallback Replace fnmatch with _glob_to_regex so that * stays within a single path segment (matching pathlib/rglob semantics) and **/ spans zero or more directories. Literal patterns now fall through to os.walk when the direct path lookup misses, so e.g. 'foo.py' still finds files at any depth. Add tests for: - bare literal matching in subdirectories - multi-segment single-star patterns (sub/*.txt) - * not crossing / boundaries - ** matching at arbitrary depth Closes #4493 --------- Co-authored-by: michaelxer --- src/agent_tools/filesystem_tools.py | 67 +++++++++++++++++++++++------ tests/test_code_nav_tools.py | 34 +++++++++++++++ 2 files changed, 88 insertions(+), 13 deletions(-) diff --git a/src/agent_tools/filesystem_tools.py b/src/agent_tools/filesystem_tools.py index 7ba22161c..a55944ae5 100644 --- a/src/agent_tools/filesystem_tools.py +++ b/src/agent_tools/filesystem_tools.py @@ -1,6 +1,7 @@ import asyncio import json import os +import re import difflib import fnmatch import shutil @@ -16,6 +17,31 @@ _CODENAV_SKIP_DIRS = frozenset({ _CODENAV_MAX_HITS = 200 _CODENAV_MAX_LINE = 400 + +def _glob_to_regex(pat: str) -> "re.Pattern": + """Translate a forward-slash glob (**, *, ?) into a compiled regex. + `**/` matches zero or more complete directories. + `*` matches within a single path segment (does not cross /). + """ + i, n, out = 0, len(pat), [] + while i < n: + if pat[i : i + 3] == "**/": + out.append("(?:[^/]+/)*") + i += 3 + elif pat[i : i + 2] == "**": + out.append(".*") + i += 2 + elif pat[i] == "*": + out.append("[^/]*") + i += 1 + elif pat[i] == "?": + out.append("[^/]") + i += 1 + else: + out.append(re.escape(pat[i])) + i += 1 + return re.compile("".join(out)) + def _unified_diff(old: str, new: str, path: str) -> Optional[Dict[str, Any]]: if old == new: return None @@ -259,23 +285,38 @@ class GlobTool: return {"error": f"glob: {e}", "exit_code": 1} def _glob(): - from pathlib import Path - base = Path(root) - if not base.is_dir(): + base = os.path.abspath(root) + if not os.path.isdir(base): return None, f"glob: {root}: not a directory" + norm_pat = pattern.replace("\\", "/") + # Fast path: literal pattern (no wildcards) → direct path lookup. + if not any(c in norm_pat for c in "*?["): + cand = os.path.normpath(os.path.join(base, norm_pat)) + if os.path.exists(cand): + return [cand], None + # Literal not at exact path — fall through to walk so + # e.g. "foo.py" still matches at any depth (like rglob). + # Compile glob to regex: * stays within one segment, **/ spans dirs. + regex = _glob_to_regex(norm_pat) matched = [] + cap = _CODENAV_MAX_HITS * 5 try: - for p in base.rglob(pattern): - if set(p.relative_to(base).parts) & _CODENAV_SKIP_DIRS: - continue - try: - mtime = p.stat().st_mtime - except OSError: - mtime = 0 - matched.append((mtime, str(p))) - if len(matched) > _CODENAV_MAX_HITS * 5: + for dp, dns, fns in os.walk(base): + # Prune skipped dirs before descending (unlike rglob which + # descends first then filters — fatal on large node_modules). + dns[:] = [d for d in dns if d not in _CODENAV_SKIP_DIRS] + for name in fns + dns: + full = os.path.join(dp, name) + rel = os.path.relpath(full, base).replace(os.sep, "/") + if regex.fullmatch(rel) or regex.fullmatch(name): + try: + mtime = os.stat(full).st_mtime + except OSError: + mtime = 0 + matched.append((mtime, full)) + if len(matched) > cap: break - except (OSError, ValueError) as _e: + except OSError as _e: return None, f"glob: {_e}" matched.sort(key=lambda t: t[0], reverse=True) return [pth for _, pth in matched[:_CODENAV_MAX_HITS]], None diff --git a/tests/test_code_nav_tools.py b/tests/test_code_nav_tools.py index 40e9b2ba0..d02840e18 100644 --- a/tests/test_code_nav_tools.py +++ b/tests/test_code_nav_tools.py @@ -24,6 +24,9 @@ def repo(): os.mkdir(os.path.join(root, "sub")) with open(os.path.join(root, "sub", "b.txt"), "w") as f: f.write("nothing\nNEEDLE upper\n") + os.mkdir(os.path.join(root, "sub", "deep")) + with open(os.path.join(root, "sub", "deep", "c.py"), "w") as f: + f.write("# deep python\n") os.mkdir(os.path.join(root, "node_modules")) with open(os.path.join(root, "node_modules", "dep.py"), "w") as f: f.write("needle in dep\n") @@ -107,6 +110,37 @@ def test_glob_requires_pattern(repo): assert r["exit_code"] == 1 +def test_glob_literal_in_subdir(repo): + """Bare literal should match at any depth (like rglob), not only at root.""" + r = _run("glob", f'{{"pattern": "b.txt", "path": "{repo}"}}') + assert r["exit_code"] == 0 + assert "b.txt" in r["output"] + + +def test_glob_multi_segment_single_star(repo): + """sub/*.txt matches sub/b.txt but NOT sub/deep/c.py (single * stays in one segment).""" + r = _run("glob", f'{{"pattern": "sub/*.txt", "path": "{repo}"}}') + assert r["exit_code"] == 0 + assert "b.txt" in r["output"] + assert "c.py" not in r["output"] + + +def test_glob_star_does_not_cross_slash(repo): + """src/*.py must NOT match src/a/b/x.py — * is single-segment only.""" + r = _run("glob", f'{{"pattern": "sub/*.py", "path": "{repo}"}}') + assert r["exit_code"] == 0 + # sub/ has no .py directly, only sub/deep/c.py — should NOT match + assert "No files matching" in r["output"] + + +def test_glob_double_star_matches_deep(repo): + """**/*.py should match files at any depth.""" + r = _run("glob", f'{{"pattern": "**/*.py", "path": "{repo}"}}') + assert r["exit_code"] == 0 + assert "a.py" in r["output"] + assert "c.py" in r["output"] + + # ── ls ──────────────────────────────────────────────────────────────────── def test_ls_lists_entries(repo):