fix(search): apply recency UTC fix to live ranking module

This commit is contained in:
Alexandre Teixeira
2026-06-03 12:49:32 +01:00
committed by GitHub
parent 0deeba58ba
commit a75dd4a231
4 changed files with 85 additions and 169 deletions
+38 -19
View File
@@ -2,12 +2,49 @@
import re import re
import logging import logging
from datetime import datetime from datetime import datetime, timezone
from typing import List, Optional from typing import List, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")
def _utcnow_naive() -> datetime:
"""Naive UTC 'now'. Matches the naive, UTC-style published dates parsed below,
and is safe on Python 3.14 where ``datetime.utcnow()`` is removed (#1116)."""
return datetime.now(timezone.utc).replace(tzinfo=None)
def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.

    Between 7 and 30 days the score falls off linearly as
    ``(30 - days_old) / 23``.

    The age is measured against UTC, not local time. The previous code used
    ``datetime.now()`` (local) against UTC-style published dates, so the age was
    skewed by the host's UTC offset; it was also a latent crash once neighbouring
    code moves to timezone-aware datetimes (#1116).

    Args:
        age_str: Published date in one of the ``_AGE_FORMATS`` layouts. Missing,
            empty, or unparseable values score 0.0.
        now: Reference time, injectable for tests. A naive value is taken as
            UTC; a timezone-aware value is converted to UTC first. Defaults to
            ``_utcnow_naive()``.

    Returns:
        A score between 0.0 and 1.0. Dates in the future score 1.0.
    """
    if not age_str:
        return 0.0

    published = None
    for fmt in _AGE_FORMATS:
        try:
            published = datetime.strptime(age_str, fmt)
        except (TypeError, ValueError):
            # ValueError: layout mismatch; TypeError: non-string input.
            continue
        break
    if published is None:
        return 0.0

    if now is None:
        now = _utcnow_naive()
    elif now.tzinfo is not None:
        # The parsed date is naive; normalise an aware reference to naive UTC
        # so the subtraction below cannot raise TypeError.
        now = now.astimezone(timezone.utc).replace(tzinfo=None)

    days_old = (now - published).days
    if days_old <= 7:
        return 1.0
    if days_old >= 30:
        return 0.0
    return (30 - days_old) / 23
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"} _NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
_SPORTS_HINTS = { _SPORTS_HINTS = {
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb", "sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
@@ -73,24 +110,6 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
return 0.7 return 0.7
return 0.4 return 0.4
def recency_score(age_str: Optional[str]) -> float:
    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.

    Between 7 and 30 days the score falls off linearly as
    ``(30 - days_old) / 23``. The age is measured against naive UTC rather than
    the host's local clock, so the result does not shift with the server's UTC
    offset.

    Args:
        age_str: Published date as ``YYYY-MM-DD``, ``YYYY-MM-DDTHH:MM:SS`` or
            ``YYYY-MM-DD HH:MM:SS``. Missing, empty, or unparseable values
            score 0.0.

    Returns:
        A score between 0.0 and 1.0. Dates in the future score 1.0.
    """
    # Function-scope import so this block does not depend on the file-level
    # import list also providing ``timezone``.
    from datetime import timezone

    if not age_str:
        return 0.0

    published = None
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
        try:
            published = datetime.strptime(age_str, fmt)
        except (TypeError, ValueError):
            # ValueError: layout mismatch; TypeError: non-string input.
            continue
        break
    if published is None:
        return 0.0

    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    days_old = (utc_now - published).days
    if days_old <= 7:
        return 1.0
    if days_old >= 30:
        return 0.0
    return (30 - days_old) / 23
def news_quality_adjustment(title: str, snippet: str, url: str) -> float: def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
if not is_news_query: if not is_news_query:
return 0.0 return 0.0
+10 -148
View File
@@ -1,151 +1,13 @@
"""Search result ranking based on relevance, source quality, and recency.""" """Compatibility re-export shim for the live ranking module.
import re The real implementation lives in :mod:`services.search.ranking`, which is what
import logging the search runtime (services/search/core.py) imports. This module used to hold a
from datetime import datetime, timezone parallel copy; it now re-exports so the two cannot drift out of sync again.
from typing import List, Optional """
from urllib.parse import urlparse
logger = logging.getLogger(__name__) from services.search.ranking import ( # noqa: F401
_AGE_FORMATS,
_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S") _utcnow_naive,
rank_search_results,
recency_score,
def _utcnow_naive() -> datetime:
"""Naive UTC 'now'. Matches the naive, UTC-style published dates parsed below,
and is safe on Python 3.14 where ``datetime.utcnow()`` is removed (#1116)."""
return datetime.now(timezone.utc).replace(tzinfo=None)
def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.

    Between 7 and 30 days the score falls off linearly as
    ``(30 - days_old) / 23``.

    The age is measured against UTC, not local time. The previous code used
    ``datetime.now()`` (local) against UTC-style published dates, so the age was
    skewed by the host's UTC offset; it was also a latent crash once neighbouring
    code moves to timezone-aware datetimes (#1116).

    Args:
        age_str: Published date in one of the ``_AGE_FORMATS`` layouts. Missing,
            empty, or unparseable values score 0.0.
        now: Reference time, injectable for tests. A naive value is taken as
            UTC; a timezone-aware value is converted to UTC first. Defaults to
            ``_utcnow_naive()``.

    Returns:
        A score between 0.0 and 1.0. Dates in the future score 1.0.
    """
    if not age_str:
        return 0.0

    published = None
    for fmt in _AGE_FORMATS:
        try:
            published = datetime.strptime(age_str, fmt)
        except (TypeError, ValueError):
            # ValueError: layout mismatch; TypeError: non-string input.
            continue
        break
    if published is None:
        return 0.0

    if now is None:
        now = _utcnow_naive()
    elif now.tzinfo is not None:
        # The parsed date is naive; normalise an aware reference to naive UTC
        # so the subtraction below cannot raise TypeError.
        now = now.astimezone(timezone.utc).replace(tzinfo=None)

    days_old = (now - published).days
    if days_old <= 7:
        return 1.0
    if days_old >= 30:
        return 0.0
    return (30 - days_old) / 23
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
_SPORTS_HINTS = {
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
"fifa", "world cup", "championship", "quarterfinal", "eliminates",
}
# Word-boundary match so "sport" does not fire inside "transport"/"passport"
# and a domain like "transport.gov" is not mistaken for a sports site.
_SPORTS_HINT_RE = re.compile(
r"\b(?:" + "|".join(re.escape(h) for h in _SPORTS_HINTS) + r")\b"
) )
_LOW_VALUE_NEWS_DOMAINS = {
"facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
"www.yahoo.com", "msn.com", "www.msn.com",
}
_TRUSTED_NEWS_DOMAINS = {
"apnews.com", "www.apnews.com", "reuters.com", "www.reuters.com",
"bbc.com", "www.bbc.com", "cbc.ca", "www.cbc.ca",
"ctvnews.ca", "www.ctvnews.ca", "globalnews.ca", "www.globalnews.ca",
"theguardian.com",
"www.theguardian.com", "euronews.com", "www.euronews.com",
"dw.com", "www.dw.com", "government.se", "www.government.se",
}
def _domain(url: str) -> str:
try:
return urlparse(url).netloc.lower()
except Exception:
return ""
def rank_search_results(query: str, results: List[dict]) -> List[dict]:
    """Rank search results by title relevance, snippet quality, domain authority, and recency.

    Each result is scored as::

        2.0 * title + 1.0 * snippet + 1.5 * domain + 1.0 * recency + news adjustment

    Args:
        query: The user's search query. Its word tokens, lower-cased, are the
            terms matched against each result.
        results: Result dicts, read through the optional keys ``title``,
            ``snippet``, ``url`` and ``age``. A missing or ``None`` text field
            is treated as an empty string.

    Returns:
        A new list holding the same dict objects, highest score first. Results
        with equal scores keep their input order.
    """
    query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
    query_lc = query.lower()
    is_news_query = any(term in _NEWS_HINTS for term in query_terms)
    is_sports_query = bool(_SPORTS_HINT_RE.search(query_lc))
    # Built once per call rather than once per (result, term) pair.
    term_patterns = [re.compile(rf"\b{re.escape(term)}\b") for term in query_terms]
    # Query terms left after removing the news hints.
    subject_terms = [t for t in query_terms if t not in _NEWS_HINTS]

    def title_score(title: str) -> float:
        """Fraction of query terms that appear as whole words in the title."""
        if not title or not term_patterns:
            return 0.0
        title_lc = title.lower()
        matches = sum(1 for pattern in term_patterns if pattern.search(title_lc))
        return matches / len(term_patterns)

    def snippet_score(snippet: str) -> float:
        """Average of a length factor (capped at 200 chars) and the term hit rate."""
        if not snippet:
            return 0.0
        length_factor = min(len(snippet), 200) / 200
        snippet_lc = snippet.lower()
        # Substring match here, unlike the whole-word match used for titles.
        term_hits = sum(1 for term in query_terms if term in snippet_lc)
        term_factor = term_hits / len(query_terms) if query_terms else 0.0
        return (length_factor + term_factor) / 2

    def domain_score(url: str) -> float:
        """Authority score for the URL's host: 1.0, 0.7, 0.4, or 0.0 if no host."""
        netloc = _domain(url)
        if not netloc:
            return 0.0
        if netloc in _TRUSTED_NEWS_DOMAINS:
            return 1.0
        if netloc.endswith((".edu", ".gov")):
            return 1.0
        if netloc.endswith(".org"):
            return 0.7
        return 0.4

    def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
        """Bonus/penalty applied only when the query itself is a news query."""
        if not is_news_query:
            return 0.0
        text = f"{title} {snippet}".lower()
        netloc = _domain(url)
        adjustment = 0.0
        if netloc in _TRUSTED_NEWS_DOMAINS:
            adjustment += 1.2
        if any(term in text for term in ("latest news", "breaking news", "daily coverage", "news from")):
            adjustment += 0.4
        if netloc in _LOW_VALUE_NEWS_DOMAINS:
            adjustment -= 0.8
        if not is_sports_query and (_SPORTS_HINT_RE.search(text) or _SPORTS_HINT_RE.search(netloc)):
            adjustment -= 1.5
        # A country/news query should not rank a page whose title/snippet barely
        # mentions the country above actual news pages for that country.
        if subject_terms and not any(t in text or t in netloc for t in subject_terms):
            adjustment -= 1.0
        return adjustment

    ranked = []
    for result in results:
        # ``or ""`` also covers keys that are present but None, which would
        # otherwise be rendered as the literal text "None" in the news check.
        title = result.get("title") or ""
        snippet = result.get("snippet") or ""
        url = result.get("url") or ""
        age = result.get("age")
        score = (
            2.0 * title_score(title)
            + 1.0 * snippet_score(snippet)
            + 1.5 * domain_score(url)
            + 1.0 * recency_score(age)
            + news_quality_adjustment(title, snippet, url)
        )
        ranked.append((score, result))

    # Sort on the score only: dicts are not orderable, and the stable sort
    # keeps the input order for ties.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [result for _, result in ranked]
+1 -1
View File
@@ -1,4 +1,4 @@
from src.search.ranking import rank_search_results from services.search.ranking import rank_search_results
def test_news_queries_prefer_news_sources_over_sports_and_social_results(): def test_news_queries_prefer_news_sources_over_sports_and_social_results():
+36 -1
View File
@@ -8,7 +8,8 @@ module-level, time-injectable function.
from datetime import datetime, timezone from datetime import datetime, timezone
from src.search.ranking import recency_score, _utcnow_naive import services.search.ranking as live_ranking
from services.search.ranking import recency_score, _utcnow_naive, rank_search_results
def test_fresh_result_scores_one(): def test_fresh_result_scores_one():
@@ -37,3 +38,37 @@ def test_default_now_is_naive_utc():
assert now.tzinfo is None assert now.tzinfo is None
reference = datetime.now(timezone.utc).replace(tzinfo=None) reference = datetime.now(timezone.utc).replace(tzinfo=None)
assert abs((now - reference).total_seconds()) < 5 assert abs((now - reference).total_seconds()) < 5
def test_supported_timestamp_formats_parse():
    # Date-only, "T"-separated and space-separated stamps all describe a
    # publication roughly four days before the pinned reference time, so every
    # one of them must get the full score.
    pinned_now = datetime(2026, 1, 5, 12, 0, 0)
    supported_stamps = (
        "2026-01-01",
        "2026-01-01T08:30:00",
        "2026-01-01 08:30:00",
    )
    for stamp in supported_stamps:
        assert recency_score(stamp, now=pinned_now) == 1.0
def test_shim_reexports_live_objects():
    # The compatibility shim must hand out the very same objects as the live
    # services module (identity, not just equality), so the two cannot diverge.
    import src.search.ranking as shim

    for exported in ("recency_score", "rank_search_results", "_utcnow_naive"):
        assert getattr(shim, exported) is getattr(live_ranking, exported)
def test_live_rank_path_prefers_newer_result(monkeypatch):
    # Freeze the module's clock so the age computation is deterministic.
    pinned_now = datetime(2026, 1, 31)
    monkeypatch.setattr(live_ranking, "_utcnow_naive", lambda: pinned_now)

    # Same title, snippet and host; only the age differs, so recency alone
    # decides the order.
    older = {"title": "Report", "url": "https://example.org/a", "snippet": "x", "age": "2026-01-01"}
    newer = {"title": "Report", "url": "https://example.org/b", "snippet": "x", "age": "2026-01-29"}

    ranked = rank_search_results("report", [older, newer])

    first, second = ranked[0], ranked[1]
    assert first["url"] == "https://example.org/b"
    assert second["url"] == "https://example.org/a"