fix(tools): prune skipped dirs before descending in glob tool (#4538)

* fix(tools): prune skipped dirs before descending in glob tool

GlobTool used pathlib.Path.rglob which descends into every directory
(including node_modules, .git, dist, etc.) and filters AFTER the walk.
On repos with large junk directories this causes the glob tool to hang
for minutes.

Replace rglob with os.walk that prunes _CODENAV_SKIP_DIRS before
descending — matching the approach GrepTool already uses. Also add a
fast path for literal patterns (no wildcards → direct path lookup).

Fixes #4493

* fix(tools): use regex glob matching to fix * semantics and literal fallback

Replace fnmatch with _glob_to_regex so that * stays within a single
path segment (matching pathlib/rglob semantics) and **/ spans zero or
more directories.  Literal patterns now fall through to os.walk when
the direct path lookup misses, so e.g. 'foo.py' still finds files at
any depth.

Add tests for:
- bare literal matching in subdirectories
- multi-segment single-star patterns (sub/*.txt)
- * not crossing / boundaries
- ** matching at arbitrary depth

Closes #4493

---------

Co-authored-by: michaelxer <michaelxer@users.noreply.github.com>
This commit is contained in:
Michael
2026-06-19 03:02:29 +07:00
committed by GitHub
parent 1cc8a373b0
commit 39a802bea2
2 changed files with 88 additions and 13 deletions
+54 -13
View File
@@ -1,6 +1,7 @@
import asyncio
import json
import os
import re
import difflib
import fnmatch
import shutil
@@ -16,6 +17,31 @@ _CODENAV_SKIP_DIRS = frozenset({
_CODENAV_MAX_HITS = 200
_CODENAV_MAX_LINE = 400
def _glob_to_regex(pat: str) -> "re.Pattern":
"""Translate a forward-slash glob (**, *, ?) into a compiled regex.
`**/` matches zero or more complete directories.
`*` matches within a single path segment (does not cross /).
"""
i, n, out = 0, len(pat), []
while i < n:
if pat[i : i + 3] == "**/":
out.append("(?:[^/]+/)*")
i += 3
elif pat[i : i + 2] == "**":
out.append(".*")
i += 2
elif pat[i] == "*":
out.append("[^/]*")
i += 1
elif pat[i] == "?":
out.append("[^/]")
i += 1
else:
out.append(re.escape(pat[i]))
i += 1
return re.compile("".join(out))
def _unified_diff(old: str, new: str, path: str) -> Optional[Dict[str, Any]]:
if old == new:
return None
@@ -259,23 +285,38 @@ class GlobTool:
return {"error": f"glob: {e}", "exit_code": 1}
def _glob():
from pathlib import Path
base = Path(root)
if not base.is_dir():
base = os.path.abspath(root)
if not os.path.isdir(base):
return None, f"glob: {root}: not a directory"
norm_pat = pattern.replace("\\", "/")
# Fast path: literal pattern (no wildcards) → direct path lookup.
if not any(c in norm_pat for c in "*?["):
cand = os.path.normpath(os.path.join(base, norm_pat))
if os.path.exists(cand):
return [cand], None
# Literal not at exact path — fall through to walk so
# e.g. "foo.py" still matches at any depth (like rglob).
# Compile glob to regex: * stays within one segment, **/ spans dirs.
regex = _glob_to_regex(norm_pat)
matched = []
cap = _CODENAV_MAX_HITS * 5
try:
for p in base.rglob(pattern):
if set(p.relative_to(base).parts) & _CODENAV_SKIP_DIRS:
continue
try:
mtime = p.stat().st_mtime
except OSError:
mtime = 0
matched.append((mtime, str(p)))
if len(matched) > _CODENAV_MAX_HITS * 5:
for dp, dns, fns in os.walk(base):
# Prune skipped dirs before descending (unlike rglob which
# descends first then filters — fatal on large node_modules).
dns[:] = [d for d in dns if d not in _CODENAV_SKIP_DIRS]
for name in fns + dns:
full = os.path.join(dp, name)
rel = os.path.relpath(full, base).replace(os.sep, "/")
if regex.fullmatch(rel) or regex.fullmatch(name):
try:
mtime = os.stat(full).st_mtime
except OSError:
mtime = 0
matched.append((mtime, full))
if len(matched) > cap:
break
except (OSError, ValueError) as _e:
except OSError as _e:
return None, f"glob: {_e}"
matched.sort(key=lambda t: t[0], reverse=True)
return [pth for _, pth in matched[:_CODENAV_MAX_HITS]], None
+34
View File
@@ -24,6 +24,9 @@ def repo():
os.mkdir(os.path.join(root, "sub"))
with open(os.path.join(root, "sub", "b.txt"), "w") as f:
f.write("nothing\nNEEDLE upper\n")
os.mkdir(os.path.join(root, "sub", "deep"))
with open(os.path.join(root, "sub", "deep", "c.py"), "w") as f:
f.write("# deep python\n")
os.mkdir(os.path.join(root, "node_modules"))
with open(os.path.join(root, "node_modules", "dep.py"), "w") as f:
f.write("needle in dep\n")
@@ -107,6 +110,37 @@ def test_glob_requires_pattern(repo):
assert r["exit_code"] == 1
def test_glob_literal_in_subdir(repo):
"""Bare literal should match at any depth (like rglob), not only at root."""
r = _run("glob", f'{{"pattern": "b.txt", "path": "{repo}"}}')
assert r["exit_code"] == 0
assert "b.txt" in r["output"]
def test_glob_multi_segment_single_star(repo):
"""sub/*.txt matches sub/b.txt but NOT sub/deep/c.py (single * stays in one segment)."""
r = _run("glob", f'{{"pattern": "sub/*.txt", "path": "{repo}"}}')
assert r["exit_code"] == 0
assert "b.txt" in r["output"]
assert "c.py" not in r["output"]
def test_glob_star_does_not_cross_slash(repo):
"""src/*.py must NOT match src/a/b/x.py — * is single-segment only."""
r = _run("glob", f'{{"pattern": "sub/*.py", "path": "{repo}"}}')
assert r["exit_code"] == 0
# sub/ has no .py directly, only sub/deep/c.py — should NOT match
assert "No files matching" in r["output"]
def test_glob_double_star_matches_deep(repo):
"""**/*.py should match files at any depth."""
r = _run("glob", f'{{"pattern": "**/*.py", "path": "{repo}"}}')
assert r["exit_code"] == 0
assert "a.py" in r["output"]
assert "c.py" in r["output"]
# ── ls ────────────────────────────────────────────────────────────────────
def test_ls_lists_entries(repo):