mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
test(taxonomy): auto-mark tests by area and sub-area (#3491)
This commit is contained in:
committed by
GitHub
parent
e7c1d75884
commit
a240f28af9
@@ -0,0 +1,162 @@
|
||||
"""Conservative test taxonomy: classify test files by area and sub-area.
|
||||
|
||||
This module is the single source of truth for the collection-time markers added
in ``tests/conftest.py``. It performs no inference beyond simple, exact matching
of filename tokens against small, explicit keyword sets. A file is matched to
the first area (in priority order) whose keyword set intersects its filename
tokens; files that match no area fall back to ``uncategorized`` with the
filename itself as the sub-area. The dedicated ``js`` and ``helpers`` rules are
checked before the keyword areas.

The categories mirror ``tests/TESTING_STANDARD.md``. This module imports nothing
from the application - only the standard library - and changes no test behavior.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections.abc import Iterable
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Area keyword sets. Keep these small and explicit; prefer leaving a file
# ``uncategorized`` over guessing. Matching is exact, token-by-token.
SECURITY_KEYWORDS = frozenset({
    "security", "auth", "owner", "scope",
    "ssrf", "xss", "confinement", "permission", "redaction",
})
CLI_KEYWORDS = frozenset({"cli"})
ROUTES_KEYWORDS = frozenset({"route", "routes", "api"})
SERVICES_KEYWORDS = frozenset({
    "llm", "provider", "cookbook", "session", "history", "email",
    "calendar", "memory", "gallery", "document", "research", "mcp",
    "scheduler", "webhook", "embedding",
})
UNIT_KEYWORDS = frozenset({
    "parse", "parser", "parsing", "nonstring", "nondict",
    "atomic", "regex", "tokenize",
})

# Keyword-matched areas, in priority order (first match wins). Security is a
# cross-cutting concern and intentionally outranks the feature areas, so e.g.
# ``test_email_owner_scope.py`` classifies as ``security``, not ``services``.
# ``js`` and ``helpers`` are matched by dedicated rules in ``_match_area``.
KEYWORD_AREAS = (
    ("security", SECURITY_KEYWORDS),
    ("cli", CLI_KEYWORDS),
    ("routes", ROUTES_KEYWORDS),
    ("services", SERVICES_KEYWORDS),
    ("unit", UNIT_KEYWORDS),
)

# File extensions that indicate a JavaScript/Node-backed test.
JS_EXTENSIONS = frozenset({".js", ".mjs", ".ts"})

# Area name (and fallback sub-area) for files that match no rule.
UNCATEGORIZED = "uncategorized"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TestClassification:
    """Area and sub-area for a single test file.

    Instances are immutable (``frozen=True``) and compare by value.
    """

    # The class name starts with ``Test``; tell pytest it is not a test class
    # so importing it into a test module does not trigger a collection warning.
    # Un-annotated, so it is a plain class attribute and not a dataclass field.
    __test__ = False

    # Top-level category, e.g. ``security`` or ``uncategorized``.
    area: str
    # Finer-grained label within the area.
    sub_area: str
|
||||
|
||||
|
||||
def normalize_marker_name(value: str) -> str:
    """Lowercase ``value`` and reduce it to a marker-safe ``[a-z0-9_]`` token.

    Every run of characters outside ``[a-z0-9]`` (after lowercasing) collapses
    to a single underscore, and leading/trailing underscores are removed.

    Args:
        value: Arbitrary text to turn into a marker name.

    Returns:
        The normalized token; empty if ``value`` has no ``[a-z0-9]`` characters.
    """
    lowered = value.lower()
    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
    return collapsed.strip("_")
|
||||
|
||||
|
||||
def _stem(path: str | Path) -> str:
    """Filename without its extension chain (``invariant.test.mjs`` -> ``invariant``).

    Unlike ``Path.stem``, which drops only the final suffix, this cuts at the
    first dot of the final path component.

    Args:
        path: File path; only its final component is used.

    Returns:
        The text before the first ``.`` in the filename.
    """
    filename = Path(path).name
    return filename.split(".", 1)[0]
|
||||
|
||||
|
||||
def _extension(path: str | Path) -> str:
    """Lowercased final file extension, e.g. ``.py`` or ``.mjs``.

    Args:
        path: File path; only the last suffix of its final component is used.

    Returns:
        The suffix including the leading dot, or ``""`` when there is none.
    """
    return Path(path).suffix.lower()
|
||||
|
||||
|
||||
def _filename_tokens(path: str | Path) -> tuple[str, ...]:
    """Underscore tokens of the filename stem, with a leading ``test`` dropped.

    The stem is normalized with ``normalize_marker_name`` first, so tokens are
    lowercase ``[a-z0-9]`` strings and never empty.

    Args:
        path: Test file path; only the filename stem contributes tokens.

    Returns:
        The tokens in filename order; only a ``test`` token in first position
        is removed.
    """
    normalized = normalize_marker_name(_stem(path))
    tokens = tuple(token for token in normalized.split("_") if token)
    if tokens and tokens[0] == "test":
        return tokens[1:]
    return tokens
|
||||
|
||||
|
||||
def _matched_keywords(tokens: tuple[str, ...], keywords: frozenset[str]) -> tuple[str, ...]:
    """Filename tokens that appear in ``keywords``, in order, de-duplicated.

    Args:
        tokens: Filename tokens in their original order.
        keywords: Keyword set to match against (exact membership).

    Returns:
        Each matching token once, ordered by first occurrence in ``tokens``.
    """
    # dict keys keep insertion order, so this drops repeats without reordering.
    return tuple(dict.fromkeys(token for token in tokens if token in keywords))
|
||||
|
||||
|
||||
def _match_area(tokens: tuple[str, ...], extension: str) -> tuple[str, tuple[str, ...]]:
    """Return ``(area, matched_keywords)`` using the conservative priority order.

    Rules are tried in this order, and the first hit wins:

    1. ``js``: the extension is in ``JS_EXTENSIONS`` or ``js`` is a token.
    2. ``helpers``: the first token is ``helpers``.
    3. The ``KEYWORD_AREAS`` entries, in their declared order.

    Args:
        tokens: Filename tokens as produced by ``_filename_tokens``.
        extension: Lowercased final extension, including the leading dot.

    Returns:
        The area name and the keywords that selected it. When nothing matches,
        the area is ``UNCATEGORIZED`` and the keyword tuple is empty.
    """
    if extension in JS_EXTENSIONS or "js" in tokens:
        return "js", ("js",)
    if tokens and tokens[0] == "helpers":
        return "helpers", ("helpers",)
    for area, keywords in KEYWORD_AREAS:
        matched = _matched_keywords(tokens, keywords)
        if matched:
            return area, matched
    return UNCATEGORIZED, ()
|
||||
|
||||
|
||||
def _sub_area(area: str, matched: tuple[str, ...], tokens: tuple[str, ...]) -> str:
    """Derive the sub-area: matched keywords for a known area, else the filename.

    Args:
        area: Area name chosen for the file.
        matched: Keywords that selected ``area``.
        tokens: All filename tokens; used only for the uncategorized fallback.

    Returns:
        The underscore-joined tokens when ``area`` is ``UNCATEGORIZED``,
        otherwise the underscore-joined matched keywords. May be empty.
    """
    parts = tokens if area == UNCATEGORIZED else matched
    return "_".join(parts)
|
||||
|
||||
|
||||
def _in_helpers_dir(path: str | Path) -> bool:
    """True if ``path`` is under the test helper dir ``tests/helpers/``.

    Matches the exact adjacent ``tests``/``helpers`` component pair, so an
    unrelated ancestor directory merely named ``helpers`` does not count.

    Args:
        path: Test file path; only its directory components are inspected.

    Returns:
        Whether a ``tests`` component is immediately followed by ``helpers``
        anywhere in the parent directory chain.
    """
    parts = Path(path).parent.parts
    # Pair each component with its successor; the filename itself is excluded.
    return any(pair == ("tests", "helpers") for pair in zip(parts, parts[1:]))
|
||||
|
||||
|
||||
def classify_test_path(path: str | Path) -> TestClassification:
    """Classify a test file path into an area and a sub-area.

    A test file under ``tests/helpers/`` is a helper self-test regardless of
    its filename, which complements the filename first-token rule in
    ``_match_area`` (e.g. ``test_helpers_import_state.py`` in ``tests/``).

    Args:
        path: Path to the test file.

    Returns:
        The classification. The sub-area is never empty: when no sub-area can
        be derived it falls back to ``UNCATEGORIZED``.
    """
    # The directory rule takes precedence over every filename-based rule.
    if _in_helpers_dir(path):
        return TestClassification(area="helpers", sub_area="helpers")
    tokens = _filename_tokens(path)
    area, matched = _match_area(tokens, _extension(path))
    # An empty join (e.g. a file with no usable tokens) falls back to the default.
    sub_area = _sub_area(area, matched, tokens) or UNCATEGORIZED
    return TestClassification(area=area, sub_area=sub_area)
|
||||
|
||||
|
||||
def markers_for_path(path: str | Path) -> tuple[str, ...]:
    """Return the ``(area_*, sub_*)`` marker names for a test file path.

    Args:
        path: Path to the test file.

    Returns:
        A two-item tuple: the ``area_``-prefixed marker followed by the
        ``sub_``-prefixed marker, both passed through ``normalize_marker_name``.
    """
    classification = classify_test_path(path)
    area_marker = normalize_marker_name(f"area_{classification.area}")
    sub_marker = normalize_marker_name(f"sub_{classification.sub_area}")
    return (area_marker, sub_marker)
|
||||
|
||||
|
||||
def discover_markers(paths: Iterable[str | Path]) -> tuple[str, ...]:
    """Distinct ``area_*`` / ``sub_*`` marker names for ``paths``, sorted.

    Pure: it derives names from the given paths only and performs no filesystem
    access of its own. The caller decides which paths to scan. Used at
    ``pytest_configure`` time to register the dynamic ``sub_*`` markers.

    Args:
        paths: Test file paths to derive marker names from.

    Returns:
        Every distinct marker name, in ascending string order; empty when
        ``paths`` is empty.
    """
    names: set[str] = set()
    for path in paths:
        names.update(markers_for_path(path))
    return tuple(sorted(names))
|
||||
Reference in New Issue
Block a user