"""Conservative test taxonomy: classify test files by area and sub-area. This module is the single source of truth for the collection-time markers added in ``tests/conftest.py``. It performs no inference beyond simple, exact matching of filename tokens against small, explicit keyword sets. A file is matched to the first area (in priority order) whose keyword set intersects its filename tokens; files that match no area fall back to ``uncategorized`` with the filename itself as the sub-area. The categories mirror ``tests/TESTING_STANDARD.md``. This module imports nothing from the application - only the standard library - and changes no test behavior. """ from __future__ import annotations import re from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path # Area keyword sets. Keep these small and explicit; prefer leaving a file # ``uncategorized`` over guessing. Matching is exact, token-by-token. SECURITY_KEYWORDS = frozenset({ "security", "auth", "owner", "scope", "ssrf", "xss", "confinement", "permission", "redaction", }) CLI_KEYWORDS = frozenset({"cli"}) ROUTES_KEYWORDS = frozenset({"route", "routes", "api"}) SERVICES_KEYWORDS = frozenset({ "llm", "provider", "cookbook", "session", "history", "email", "calendar", "memory", "gallery", "document", "research", "mcp", "scheduler", "webhook", "embedding", }) UNIT_KEYWORDS = frozenset({ "parse", "parser", "parsing", "nonstring", "nondict", "atomic", "regex", "tokenize", }) # Keyword-matched areas, in priority order (first match wins). Security is a # cross-cutting concern and intentionally outranks the feature areas, so e.g. # ``test_email_owner_scope.py`` classifies as ``security``, not ``services``. # ``js`` and ``helpers`` are matched by dedicated rules in ``_match_area``. KEYWORD_AREAS = ( ("security", SECURITY_KEYWORDS), ("cli", CLI_KEYWORDS), ("routes", ROUTES_KEYWORDS), ("services", SERVICES_KEYWORDS), ("unit", UNIT_KEYWORDS), ) # File extensions that indicate a JavaScript/Node-backed test. JS_EXTENSIONS = frozenset({".js", ".mjs", ".ts"}) UNCATEGORIZED = "uncategorized" @dataclass(frozen=True) class TestClassification: """Area and sub-area for a single test file.""" area: str sub_area: str def normalize_marker_name(value: str) -> str: """Lowercase ``value`` and reduce it to a marker-safe ``[a-z0-9_]`` token.""" lowered = value.lower() collapsed = re.sub(r"[^a-z0-9]+", "_", lowered) return collapsed.strip("_") def _stem(path: str | Path) -> str: """Filename without its extension chain (``invariant.test.mjs`` -> ``invariant``).""" return Path(path).name.split(".", 1)[0] def _extension(path: str | Path) -> str: """Lowercased final file extension, e.g. ``.py`` or ``.mjs``.""" return Path(path).suffix.lower() def _filename_tokens(path: str | Path) -> tuple[str, ...]: """Underscore tokens of the filename stem, with a leading ``test`` dropped.""" tokens = tuple(t for t in normalize_marker_name(_stem(path)).split("_") if t) if tokens and tokens[0] == "test": tokens = tokens[1:] return tokens def _matched_keywords(tokens: tuple[str, ...], keywords: frozenset[str]) -> tuple[str, ...]: """Filename tokens that appear in ``keywords``, in order, de-duplicated.""" matched: list[str] = [] for token in tokens: if token in keywords and token not in matched: matched.append(token) return tuple(matched) def _match_area(tokens: tuple[str, ...], extension: str) -> tuple[str, tuple[str, ...]]: """Return ``(area, matched_keywords)`` using the conservative priority order.""" if extension in JS_EXTENSIONS or "js" in tokens: return "js", ("js",) if tokens and tokens[0] == "helpers": return "helpers", ("helpers",) for area, keywords in KEYWORD_AREAS: matched = _matched_keywords(tokens, keywords) if matched: return area, matched return UNCATEGORIZED, () def _sub_area(area: str, matched: tuple[str, ...], tokens: tuple[str, ...]) -> str: """Derive the sub-area: matched keywords for a known area, else the filename.""" if area == UNCATEGORIZED: return "_".join(tokens) return "_".join(matched) def _in_helpers_dir(path: str | Path) -> bool: """True if ``path`` is under the test helper dir ``tests/helpers/``. Matches the exact adjacent ``tests``/``helpers`` component pair, so an unrelated ancestor directory merely named ``helpers`` does not count. """ parts = Path(path).parent.parts adjacent_pairs = list(zip(parts, parts[1:])) return ("tests", "helpers") in adjacent_pairs def classify_test_path(path: str | Path) -> TestClassification: """Classify a test file path into an area and a sub-area. A test file under a ``helpers`` directory is a helper self-test regardless of its filename, which complements the filename first-token rule in ``_match_area`` (e.g. ``test_helpers_import_state.py`` in ``tests/``). """ if _in_helpers_dir(path): return TestClassification(area="helpers", sub_area="helpers") tokens = _filename_tokens(path) area, matched = _match_area(tokens, _extension(path)) sub_area = _sub_area(area, matched, tokens) or UNCATEGORIZED return TestClassification(area=area, sub_area=sub_area) def markers_for_path(path: str | Path) -> tuple[str, ...]: """Return the ``(area_*, sub_*)`` marker names for a test file path.""" classification = classify_test_path(path) area_marker = normalize_marker_name(f"area_{classification.area}") sub_marker = normalize_marker_name(f"sub_{classification.sub_area}") return (area_marker, sub_marker) def discover_markers(paths: Iterable[str | Path]) -> tuple[str, ...]: """Distinct ``area_*`` / ``sub_*`` marker names for ``paths``, sorted. Pure: it derives names from the given paths only and performs no filesystem access of its own. The caller decides which paths to scan. Used at ``pytest_configure`` time to register the dynamic ``sub_*`` markers. """ names: set[str] = set() for path in paths: names.update(markers_for_path(path)) return tuple(sorted(names))