Files
2026-06-09 01:13:28 +02:00

163 lines
6.1 KiB
Python

"""Conservative test taxonomy: classify test files by area and sub-area.

This module is the single source of truth for the collection-time markers added
in ``tests/conftest.py``. It performs no inference beyond simple, exact matching
of filename tokens against small, explicit keyword sets. A file is matched to
the first area (in priority order) whose keyword set intersects its filename
tokens; files that match no area fall back to ``uncategorized`` with the
filename tokens (minus a leading ``test``) as the sub-area. Two areas are
decided by dedicated rules before the keyword sets are consulted: ``js`` (by
file extension or a ``js`` filename token) and ``helpers`` (by a
``tests/helpers/`` directory or a leading ``helpers`` filename token).

The categories mirror ``tests/TESTING_STANDARD.md``. This module imports nothing
from the application - only the standard library - and changes no test behavior.
"""
from __future__ import annotations
import re
from collections.abc import Iterable
from dataclasses import dataclass
from pathlib import Path
# Per-area keyword sets. They are deliberately tiny and explicit: when in
# doubt a file should stay ``uncategorized`` instead of being guessed into an
# area. A filename token only counts if it equals a keyword exactly.
SECURITY_KEYWORDS = frozenset(
    {
        "security",
        "auth",
        "owner",
        "scope",
        "ssrf",
        "xss",
        "confinement",
        "permission",
        "redaction",
    }
)

CLI_KEYWORDS = frozenset({"cli"})

ROUTES_KEYWORDS = frozenset({"route", "routes", "api"})

SERVICES_KEYWORDS = frozenset(
    {
        "llm",
        "provider",
        "cookbook",
        "session",
        "history",
        "email",
        "calendar",
        "memory",
        "gallery",
        "document",
        "research",
        "mcp",
        "scheduler",
        "webhook",
        "embedding",
    }
)

UNIT_KEYWORDS = frozenset(
    {
        "parse",
        "parser",
        "parsing",
        "nonstring",
        "nondict",
        "atomic",
        "regex",
        "tokenize",
    }
)

# Areas resolved by keyword, listed from highest to lowest priority; the first
# area with a matching token wins. Security cuts across features, so it is
# placed first on purpose: ``test_email_owner_scope.py`` becomes ``security``
# rather than ``services``. The ``js`` and ``helpers`` areas are not listed
# here because ``_match_area`` handles them with dedicated rules.
KEYWORD_AREAS = (
    ("security", SECURITY_KEYWORDS),
    ("cli", CLI_KEYWORDS),
    ("routes", ROUTES_KEYWORDS),
    ("services", SERVICES_KEYWORDS),
    ("unit", UNIT_KEYWORDS),
)

# Extensions that mark a test file as JavaScript/Node-backed.
JS_EXTENSIONS = frozenset({".js", ".mjs", ".ts"})

# Area name used when no rule matches.
UNCATEGORIZED = "uncategorized"
@dataclass(frozen=True)
class TestClassification:
    """Area and sub-area for a single test file.

    Attributes:
        area: Name of the matched area (for example ``security``, ``js``,
            ``helpers`` or ``uncategorized``).
        sub_area: Finer-grained label within the area, built from the matched
            keywords or, for uncategorized files, from the filename tokens.
    """

    # The class name starts with ``Test`` and dataclasses define ``__init__``,
    # so pytest would try to collect this class from any test module that
    # imports it and emit a PytestCollectionWarning. Opt out explicitly. The
    # attribute is left unannotated so it is a plain class attribute, not a
    # dataclass field.
    __test__ = False

    area: str
    sub_area: str
def normalize_marker_name(value: str) -> str:
    """Turn ``value`` into a lowercase, marker-safe ``[a-z0-9_]`` token.

    After lowercasing, every run of characters outside ``[a-z0-9]`` is replaced
    by a single underscore, and underscores at either end are trimmed away.
    """
    return re.sub(r"[^a-z0-9]+", "_", value.lower()).strip("_")
def _stem(path: str | Path) -> str:
    """Return the filename up to its first dot (``invariant.test.mjs`` -> ``invariant``)."""
    filename = Path(path).name
    # Everything after the first dot is treated as the extension chain.
    head, _dot, _extensions = filename.partition(".")
    return head
def _extension(path: str | Path) -> str:
    """Return the final extension of ``path`` in lowercase, e.g. ``.py`` or ``.mjs``."""
    suffix = Path(path).suffix
    return suffix.lower()
def _filename_tokens(path: str | Path) -> tuple[str, ...]:
    """Split the normalized filename stem on underscores, minus a leading ``test``.

    Empty pieces are discarded, and only a ``test`` token in the first position
    is removed; later occurrences are kept.
    """
    normalized = normalize_marker_name(_stem(path))
    pieces = [piece for piece in normalized.split("_") if piece]
    if pieces and pieces[0] == "test":
        pieces = pieces[1:]
    return tuple(pieces)
def _matched_keywords(tokens: tuple[str, ...], keywords: frozenset[str]) -> tuple[str, ...]:
    """Return the tokens found in ``keywords``, first-seen order, without repeats."""
    # A dict keeps insertion order and drops duplicate keys, which gives the
    # ordered de-duplication in a single pass.
    hits = dict.fromkeys(token for token in tokens if token in keywords)
    return tuple(hits)
def _match_area(tokens: tuple[str, ...], extension: str) -> tuple[str, tuple[str, ...]]:
    """Pick the area for a file and report which keywords decided it.

    Rules are tried in a fixed order: JavaScript (by extension or a ``js``
    token), then a leading ``helpers`` token, then the keyword areas in
    ``KEYWORD_AREAS`` order. If nothing matches, the result is
    ``(UNCATEGORIZED, ())``.
    """
    is_js = "js" in tokens or extension in JS_EXTENSIONS
    if is_js:
        return "js", ("js",)

    leading = tokens[0] if tokens else None
    if leading == "helpers":
        return "helpers", ("helpers",)

    # First keyword area with at least one matching token wins.
    for area_name, area_keywords in KEYWORD_AREAS:
        hits = _matched_keywords(tokens, area_keywords)
        if not hits:
            continue
        return area_name, hits

    return UNCATEGORIZED, ()
def _sub_area(area: str, matched: tuple[str, ...], tokens: tuple[str, ...]) -> str:
    """Build the sub-area label for a classified file.

    Uncategorized files use all of their filename tokens; every other area uses
    only the keywords that matched. The chosen parts are joined with ``_``.
    """
    parts = tokens if area == UNCATEGORIZED else matched
    return "_".join(parts)
def _in_helpers_dir(path: str | Path) -> bool:
    """Report whether ``path`` lives somewhere below a ``tests/helpers/`` directory.

    Only a ``helpers`` directory whose immediate parent is named ``tests``
    counts; a directory that merely happens to be called ``helpers`` elsewhere
    in the ancestry does not. The filename itself is never inspected.
    """
    directories = Path(path).parent.parts
    return any(
        (parent, child) == ("tests", "helpers")
        for parent, child in zip(directories, directories[1:])
    )
def classify_test_path(path: str | Path) -> TestClassification:
    """Work out the area and sub-area for a test file path.

    Anything located under ``tests/helpers/`` is a helper self-test no matter
    what the file is called. This complements the filename rule in
    ``_match_area``, which catches files such as
    ``test_helpers_import_state.py`` that sit directly in ``tests/``.
    All other files are classified from their filename tokens and extension.
    """
    if not _in_helpers_dir(path):
        name_tokens = _filename_tokens(path)
        area, hits = _match_area(name_tokens, _extension(path))
        label = _sub_area(area, hits, name_tokens)
        if not label:
            # Nothing usable in the filename: fall back to the generic name.
            label = UNCATEGORIZED
        return TestClassification(area=area, sub_area=label)

    return TestClassification(area="helpers", sub_area="helpers")
def markers_for_path(path: str | Path) -> tuple[str, ...]:
    """Produce the two marker names, ``area_*`` then ``sub_*``, for a test file."""
    found = classify_test_path(path)
    return (
        normalize_marker_name(f"area_{found.area}"),
        normalize_marker_name(f"sub_{found.sub_area}"),
    )
def discover_markers(paths: Iterable[str | Path]) -> tuple[str, ...]:
    """Collect every distinct ``area_*`` / ``sub_*`` marker for ``paths``, sorted.

    The function is pure: names come solely from the paths handed in, and it
    never touches the filesystem itself. Choosing which paths to scan is the
    caller's job. It is used at ``pytest_configure`` time to register the
    dynamic ``sub_*`` markers.
    """
    unique = {name for path in paths for name in markers_for_path(path)}
    return tuple(sorted(unique))