mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-23 05:05:24 -04:00
fix(tools): prune skipped dirs before descending in glob tool (#4538)
* fix(tools): prune skipped dirs before descending in glob tool GlobTool used pathlib.Path.rglob which descends into every directory (including node_modules, .git, dist, etc.) and filters AFTER the walk. On repos with large junk directories this causes the glob tool to hang for minutes. Replace rglob with os.walk that prunes _CODENAV_SKIP_DIRS before descending — matching the approach GrepTool already uses. Also add a fast path for literal patterns (no wildcards → direct path lookup). Fixes #4493 * fix(tools): use regex glob matching to fix * semantics and literal fallback Replace fnmatch with _glob_to_regex so that * stays within a single path segment (matching pathlib/rglob semantics) and **/ spans zero or more directories. Literal patterns now fall through to os.walk when the direct path lookup misses, so e.g. 'foo.py' still finds files at any depth. Add tests for: - bare literal matching in subdirectories - multi-segment single-star patterns (sub/*.txt) - * not crossing / boundaries - ** matching at arbitrary depth Closes #4493 --------- Co-authored-by: michaelxer <michaelxer@users.noreply.github.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import difflib
|
||||
import fnmatch
|
||||
import shutil
|
||||
@@ -16,6 +17,31 @@ _CODENAV_SKIP_DIRS = frozenset({
|
||||
_CODENAV_MAX_HITS = 200
|
||||
_CODENAV_MAX_LINE = 400
|
||||
|
||||
|
||||
def _glob_to_regex(pat: str) -> "re.Pattern":
|
||||
"""Translate a forward-slash glob (**, *, ?) into a compiled regex.
|
||||
`**/` matches zero or more complete directories.
|
||||
`*` matches within a single path segment (does not cross /).
|
||||
"""
|
||||
i, n, out = 0, len(pat), []
|
||||
while i < n:
|
||||
if pat[i : i + 3] == "**/":
|
||||
out.append("(?:[^/]+/)*")
|
||||
i += 3
|
||||
elif pat[i : i + 2] == "**":
|
||||
out.append(".*")
|
||||
i += 2
|
||||
elif pat[i] == "*":
|
||||
out.append("[^/]*")
|
||||
i += 1
|
||||
elif pat[i] == "?":
|
||||
out.append("[^/]")
|
||||
i += 1
|
||||
else:
|
||||
out.append(re.escape(pat[i]))
|
||||
i += 1
|
||||
return re.compile("".join(out))
|
||||
|
||||
def _unified_diff(old: str, new: str, path: str) -> Optional[Dict[str, Any]]:
|
||||
if old == new:
|
||||
return None
|
||||
@@ -259,23 +285,38 @@ class GlobTool:
|
||||
return {"error": f"glob: {e}", "exit_code": 1}
|
||||
|
||||
def _glob():
|
||||
from pathlib import Path
|
||||
base = Path(root)
|
||||
if not base.is_dir():
|
||||
base = os.path.abspath(root)
|
||||
if not os.path.isdir(base):
|
||||
return None, f"glob: {root}: not a directory"
|
||||
norm_pat = pattern.replace("\\", "/")
|
||||
# Fast path: literal pattern (no wildcards) → direct path lookup.
|
||||
if not any(c in norm_pat for c in "*?["):
|
||||
cand = os.path.normpath(os.path.join(base, norm_pat))
|
||||
if os.path.exists(cand):
|
||||
return [cand], None
|
||||
# Literal not at exact path — fall through to walk so
|
||||
# e.g. "foo.py" still matches at any depth (like rglob).
|
||||
# Compile glob to regex: * stays within one segment, **/ spans dirs.
|
||||
regex = _glob_to_regex(norm_pat)
|
||||
matched = []
|
||||
cap = _CODENAV_MAX_HITS * 5
|
||||
try:
|
||||
for p in base.rglob(pattern):
|
||||
if set(p.relative_to(base).parts) & _CODENAV_SKIP_DIRS:
|
||||
continue
|
||||
try:
|
||||
mtime = p.stat().st_mtime
|
||||
except OSError:
|
||||
mtime = 0
|
||||
matched.append((mtime, str(p)))
|
||||
if len(matched) > _CODENAV_MAX_HITS * 5:
|
||||
for dp, dns, fns in os.walk(base):
|
||||
# Prune skipped dirs before descending (unlike rglob which
|
||||
# descends first then filters — fatal on large node_modules).
|
||||
dns[:] = [d for d in dns if d not in _CODENAV_SKIP_DIRS]
|
||||
for name in fns + dns:
|
||||
full = os.path.join(dp, name)
|
||||
rel = os.path.relpath(full, base).replace(os.sep, "/")
|
||||
if regex.fullmatch(rel) or regex.fullmatch(name):
|
||||
try:
|
||||
mtime = os.stat(full).st_mtime
|
||||
except OSError:
|
||||
mtime = 0
|
||||
matched.append((mtime, full))
|
||||
if len(matched) > cap:
|
||||
break
|
||||
except (OSError, ValueError) as _e:
|
||||
except OSError as _e:
|
||||
return None, f"glob: {_e}"
|
||||
matched.sort(key=lambda t: t[0], reverse=True)
|
||||
return [pth for _, pth in matched[:_CODENAV_MAX_HITS]], None
|
||||
|
||||
Reference in New Issue
Block a user