test(taxonomy): auto-mark tests by area and sub-area (#3491)

2026-06-15 17:25:26 -04:00 · 2026-06-09 00:13:28 +01:00
parent e7c1d75884
commit a240f28af9
5 changed files with 380 additions and 4 deletions
@@ -0,0 +1,162 @@
+"""Conservative test taxonomy: classify test files by area and sub-area.
+
+This module is the single source of truth for the collection-time markers added
+in ``tests/conftest.py``. It performs no inference beyond simple, exact matching
+of filename tokens against small, explicit keyword sets. A file is matched to
+the first area (in priority order) whose keyword set intersects its filename
+tokens; files that match no area fall back to ``uncategorized`` with the
+filename itself as the sub-area.
+
+The categories mirror ``tests/TESTING_STANDARD.md``. This module imports nothing
+from the application - only the standard library - and changes no test behavior.
+"""
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+# Area keyword sets. Keep these small and explicit; prefer leaving a file
+# ``uncategorized`` over guessing. Matching is exact, token-by-token.
+SECURITY_KEYWORDS = frozenset({
+    "security", "auth", "owner", "scope",
+    "ssrf", "xss", "confinement", "permission", "redaction",
+})
+CLI_KEYWORDS = frozenset({"cli"})
+ROUTES_KEYWORDS = frozenset({"route", "routes", "api"})
+SERVICES_KEYWORDS = frozenset({
+    "llm", "provider", "cookbook", "session", "history", "email",
+    "calendar", "memory", "gallery", "document", "research", "mcp",
+    "scheduler", "webhook", "embedding",
+})
+UNIT_KEYWORDS = frozenset({
+    "parse", "parser", "parsing", "nonstring", "nondict",
+    "atomic", "regex", "tokenize",
+})
+
+# Keyword-matched areas, in priority order (first match wins). Security is a
+# cross-cutting concern and intentionally outranks the feature areas, so e.g.
+# ``test_email_owner_scope.py`` classifies as ``security``, not ``services``.
+# ``js`` and ``helpers`` are matched by dedicated rules in ``_match_area``.
+KEYWORD_AREAS = (
+    ("security", SECURITY_KEYWORDS),
+    ("cli", CLI_KEYWORDS),
+    ("routes", ROUTES_KEYWORDS),
+    ("services", SERVICES_KEYWORDS),
+    ("unit", UNIT_KEYWORDS),
+)
+
+# File extensions that indicate a JavaScript/Node-backed test.
+JS_EXTENSIONS = frozenset({".js", ".mjs", ".ts"})
+
+UNCATEGORIZED = "uncategorized"
+
+
+@dataclass(frozen=True)
+class TestClassification:
+    """Area and sub-area for a single test file."""
+
+    area: str
+    sub_area: str
+
+
+def normalize_marker_name(value: str) -> str:
+    """Lowercase ``value`` and reduce it to a marker-safe ``[a-z0-9_]`` token."""
+    lowered = value.lower()
+    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
+    return collapsed.strip("_")
+
+
+def _stem(path: str | Path) -> str:
+    """Filename without its extension chain (``invariant.test.mjs`` -> ``invariant``)."""
+    return Path(path).name.split(".", 1)[0]
+
+
+def _extension(path: str | Path) -> str:
+    """Lowercased final file extension, e.g. ``.py`` or ``.mjs``."""
+    return Path(path).suffix.lower()
+
+
+def _filename_tokens(path: str | Path) -> tuple[str, ...]:
+    """Underscore tokens of the filename stem, with a leading ``test`` dropped."""
+    tokens = tuple(t for t in normalize_marker_name(_stem(path)).split("_") if t)
+    if tokens and tokens[0] == "test":
+        tokens = tokens[1:]
+    return tokens
+
+
+def _matched_keywords(tokens: tuple[str, ...], keywords: frozenset[str]) -> tuple[str, ...]:
+    """Filename tokens that appear in ``keywords``, in order, de-duplicated."""
+    matched: list[str] = []
+    for token in tokens:
+        if token in keywords and token not in matched:
+            matched.append(token)
+    return tuple(matched)
+
+
+def _match_area(tokens: tuple[str, ...], extension: str) -> tuple[str, tuple[str, ...]]:
+    """Return ``(area, matched_keywords)`` using the conservative priority order."""
+    if extension in JS_EXTENSIONS or "js" in tokens:
+        return "js", ("js",)
+    if tokens and tokens[0] == "helpers":
+        return "helpers", ("helpers",)
+    for area, keywords in KEYWORD_AREAS:
+        matched = _matched_keywords(tokens, keywords)
+        if matched:
+            return area, matched
+    return UNCATEGORIZED, ()
+
+
+def _sub_area(area: str, matched: tuple[str, ...], tokens: tuple[str, ...]) -> str:
+    """Derive the sub-area: matched keywords for a known area, else the filename."""
+    if area == UNCATEGORIZED:
+        return "_".join(tokens)
+    return "_".join(matched)
+
+
+def _in_helpers_dir(path: str | Path) -> bool:
+    """True if ``path`` is under the test helper dir ``tests/helpers/``.
+
+    Matches the exact adjacent ``tests``/``helpers`` component pair, so an
+    unrelated ancestor directory merely named ``helpers`` does not count.
+    """
+    parts = Path(path).parent.parts
+    adjacent_pairs = list(zip(parts, parts[1:]))
+    return ("tests", "helpers") in adjacent_pairs
+
+
+def classify_test_path(path: str | Path) -> TestClassification:
+    """Classify a test file path into an area and a sub-area.
+
+    A test file under a ``helpers`` directory is a helper self-test regardless of
+    its filename, which complements the filename first-token rule in
+    ``_match_area`` (e.g. ``test_helpers_import_state.py`` in ``tests/``).
+    """
+    if _in_helpers_dir(path):
+        return TestClassification(area="helpers", sub_area="helpers")
+    tokens = _filename_tokens(path)
+    area, matched = _match_area(tokens, _extension(path))
+    sub_area = _sub_area(area, matched, tokens) or UNCATEGORIZED
+    return TestClassification(area=area, sub_area=sub_area)
+
+
+def markers_for_path(path: str | Path) -> tuple[str, ...]:
+    """Return the ``(area_*, sub_*)`` marker names for a test file path."""
+    classification = classify_test_path(path)
+    area_marker = normalize_marker_name(f"area_{classification.area}")
+    sub_marker = normalize_marker_name(f"sub_{classification.sub_area}")
+    return (area_marker, sub_marker)
+
+
+def discover_markers(paths: Iterable[str | Path]) -> tuple[str, ...]:
+    """Distinct ``area_*`` / ``sub_*`` marker names for ``paths``, sorted.
+
+    Pure: it derives names from the given paths only and performs no filesystem
+    access of its own. The caller decides which paths to scan. Used at
+    ``pytest_configure`` time to register the dynamic ``sub_*`` markers.
+    """
+    names: set[str] = set()
+    for path in paths:
+        names.update(markers_for_path(path))
+    return tuple(sorted(names))