mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
fix(search): apply recency UTC fix to live ranking module
This commit is contained in:
committed by
GitHub
parent
0deeba58ba
commit
a75dd4a231
+38
-19
@@ -2,12 +2,49 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# strptime patterns tried in order when parsing a result's published date:
# date only, date-time with a "T" separator, and date-time with a space.
_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
|
||||||
|
def _utcnow_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    The ``tzinfo`` is stripped so the value can be subtracted from the naive
    dates that ``datetime.strptime`` produces. It is built from
    ``datetime.now(timezone.utc)`` instead of the deprecated
    ``datetime.utcnow()`` (#1116).
    """
    aware_now = datetime.now(tz=timezone.utc)
    return aware_now.replace(tzinfo=None)
|
||||||
|
|
||||||
|
|
||||||
|
def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.

    Ages between 7 and 30 whole days fall off linearly as ``(30 - days) / 23``.

    The age is measured against UTC, not local time. The previous code used
    ``datetime.now()`` (local) against UTC-style published dates, so the age
    was skewed by the host's UTC offset (#1116).

    Args:
        age_str: Published date in one of the ``_AGE_FORMATS`` layouts. A
            missing, empty, or unparseable value scores 0.0.
        now: Reference time, injectable for tests. Defaults to the current
            UTC time. A timezone-aware value is converted to naive UTC so it
            can be compared with the naive parsed date.

    Returns:
        A score in the range [0.0, 1.0].
    """
    if not age_str:
        return 0.0

    published = None
    for fmt in _AGE_FORMATS:
        try:
            published = datetime.strptime(age_str, fmt)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: a non-string age value.
            continue
        break
    if published is None:
        return 0.0

    if now is None:
        now = _utcnow_naive()
    elif now.tzinfo is not None:
        # Subtracting a naive datetime from an aware one raises TypeError.
        now = now.astimezone(timezone.utc).replace(tzinfo=None)

    days_old = (now - published).days
    if days_old <= 7:
        return 1.0
    if days_old >= 30:
        return 0.0
    return (30 - days_old) / 23
|
||||||
|
|
||||||
|
|
||||||
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
|
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
|
||||||
_SPORTS_HINTS = {
|
_SPORTS_HINTS = {
|
||||||
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
||||||
@@ -73,24 +110,6 @@ def rank_search_results(query: str, results: List[dict]) -> List[dict]:
|
|||||||
return 0.7
|
return 0.7
|
||||||
return 0.4
|
return 0.4
|
||||||
|
|
||||||
def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.

    Ages between 7 and 30 whole days fall off linearly as ``(30 - days) / 23``.
    The age is measured against UTC rather than the host's local time, so the
    score does not shift with the machine's UTC offset.

    Args:
        age_str: Published date as ``YYYY-MM-DD``, ``YYYY-MM-DDTHH:MM:SS`` or
            ``YYYY-MM-DD HH:MM:SS``. A missing, empty, or unparseable value
            scores 0.0.
        now: Reference time, injectable for tests. Defaults to the current
            UTC time. A timezone-aware value is converted to naive UTC.

    Returns:
        A score in the range [0.0, 1.0].
    """
    # Imported locally so this function does not depend on the module's
    # ``datetime`` import line also exposing ``timezone``.
    from datetime import timezone

    if not age_str:
        return 0.0

    published = None
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
        try:
            published = datetime.strptime(age_str, fmt)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: a non-string age value.
            continue
        break
    if published is None:
        return 0.0

    if now is None:
        now = datetime.now(timezone.utc).replace(tzinfo=None)
    elif now.tzinfo is not None:
        # Subtracting a naive datetime from an aware one raises TypeError.
        now = now.astimezone(timezone.utc).replace(tzinfo=None)

    days_old = (now - published).days
    if days_old <= 7:
        return 1.0
    if days_old >= 30:
        return 0.0
    return (30 - days_old) / 23
|
|
||||||
|
|
||||||
def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
|
def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
|
||||||
if not is_news_query:
|
if not is_news_query:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|||||||
+10
-148
@@ -1,151 +1,13 @@
|
|||||||
"""Search result ranking based on relevance, source quality, and recency."""
|
"""Compatibility re-export shim for the live ranking module.
|
||||||
|
|
||||||
import re
|
The real implementation lives in :mod:`services.search.ranking`, which is what
|
||||||
import logging
|
the search runtime (services/search/core.py) imports. This module used to hold a
|
||||||
from datetime import datetime, timezone
|
parallel copy; it now re-exports so the two cannot drift out of sync again.
|
||||||
from typing import List, Optional
|
"""
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
from services.search.ranking import ( # noqa: F401
|
||||||
|
_AGE_FORMATS,
|
||||||
# strptime patterns tried in order when parsing a result's published date:
# date only, date-time with a "T" separator, and date-time with a space.
_AGE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S")
|
_utcnow_naive,
|
||||||
|
rank_search_results,
|
||||||
|
recency_score,
|
||||||
def _utcnow_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    The ``tzinfo`` is stripped so the value can be subtracted from the naive
    dates that ``datetime.strptime`` produces. It is built from
    ``datetime.now(timezone.utc)`` instead of the deprecated
    ``datetime.utcnow()`` (#1116).
    """
    aware_now = datetime.now(tz=timezone.utc)
    return aware_now.replace(tzinfo=None)
|
|
||||||
|
|
||||||
|
|
||||||
def recency_score(age_str: Optional[str], now: Optional[datetime] = None) -> float:
    """Score how recent a result is: 1.0 for <=7 days old, 0.0 for >=30 days.

    Ages between 7 and 30 whole days fall off linearly as ``(30 - days) / 23``.

    The age is measured against UTC, not local time. The previous code used
    ``datetime.now()`` (local) against UTC-style published dates, so the age
    was skewed by the host's UTC offset (#1116).

    Args:
        age_str: Published date in one of the ``_AGE_FORMATS`` layouts. A
            missing, empty, or unparseable value scores 0.0.
        now: Reference time, injectable for tests. Defaults to the current
            UTC time. A timezone-aware value is converted to naive UTC so it
            can be compared with the naive parsed date.

    Returns:
        A score in the range [0.0, 1.0].
    """
    if not age_str:
        return 0.0

    published = None
    for fmt in _AGE_FORMATS:
        try:
            published = datetime.strptime(age_str, fmt)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: a non-string age value.
            continue
        break
    if published is None:
        return 0.0

    if now is None:
        now = _utcnow_naive()
    elif now.tzinfo is not None:
        # Subtracting a naive datetime from an aware one raises TypeError.
        now = now.astimezone(timezone.utc).replace(tzinfo=None)

    days_old = (now - published).days
    if days_old <= 7:
        return 1.0
    if days_old >= 30:
        return 0.0
    return (30 - days_old) / 23
|
|
||||||
|
|
||||||
|
|
||||||
# Query words that mark a search as a news query.
_NEWS_HINTS = {
    "news",
    "nyheter",
    "headlines",
    "breaking",
    "latest",
    "today",
    "idag",
}

# Words and phrases that mark a query, page text, or host name as sports-related.
_SPORTS_HINTS = {
    "sport",
    "sports",
    "soccer",
    "football",
    "hockey",
    "nba",
    "nfl",
    "mlb",
    "fifa",
    "world cup",
    "championship",
    "quarterfinal",
    "eliminates",
}

# Word-boundary match so "sport" does not fire inside "transport"/"passport"
# and a domain like "transport.gov" is not mistaken for a sports site.
_SPORTS_ALTERNATION = "|".join(re.escape(h) for h in _SPORTS_HINTS)
_SPORTS_HINT_RE = re.compile(r"\b(?:" + _SPORTS_ALTERNATION + r")\b")
|
)
|
||||||
# Hosts whose results are pushed down for news queries.
_LOW_VALUE_NEWS_DOMAINS = {
    "facebook.com", "www.facebook.com",
    "sports.yahoo.com",
    "yahoo.com", "www.yahoo.com",
    "msn.com", "www.msn.com",
}

# Hosts treated as authoritative news sources (bare and "www." forms).
_TRUSTED_NEWS_DOMAINS = {
    "apnews.com", "www.apnews.com",
    "reuters.com", "www.reuters.com",
    "bbc.com", "www.bbc.com",
    "cbc.ca", "www.cbc.ca",
    "ctvnews.ca", "www.ctvnews.ca",
    "globalnews.ca", "www.globalnews.ca",
    "theguardian.com", "www.theguardian.com",
    "euronews.com", "www.euronews.com",
    "dw.com", "www.dw.com",
    "government.se", "www.government.se",
}
|
|
||||||
|
|
||||||
|
|
||||||
def _domain(url: str) -> str:
    """Return the lower-cased host name of *url*, or ``""`` if there is none.

    Only the host is returned: any ``user:password@`` prefix and ``:port``
    suffix are dropped, so ``https://www.bbc.com:443/x`` yields
    ``"www.bbc.com"`` and still matches the domain sets and suffix checks.

    Args:
        url: The result URL. Malformed or non-string input yields ``""``
            instead of raising.
    """
    try:
        host = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL such as an unterminated IPv6 literal;
        # TypeError/AttributeError: a value that is not a string.
        return ""
    return host or ""
|
|
||||||
|
|
||||||
|
|
||||||
def rank_search_results(query: str, results: List[dict]) -> List[dict]:
    """Rank search results by title relevance, snippet quality, domain authority, and recency.

    Each result is scored as
    ``2.0 * title + 1.0 * snippet + 1.5 * domain + 1.0 * recency + news adjustment``
    and the results are returned best-first. Results with equal scores keep
    their input order.

    Args:
        query: Raw user query.
        results: Result dicts. The ``title``, ``snippet``, ``url`` and ``age``
            keys are read; a missing or ``None`` text field is treated as
            empty.

    Returns:
        A new list holding the same result dicts, highest score first.
    """
    query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
    query_lc = query.lower()
    is_news_query = any(term in _NEWS_HINTS for term in query_terms)
    is_sports_query = bool(_SPORTS_HINT_RE.search(query_lc))
    # Compiled once per call instead of once per (term, result) pair.
    term_patterns = [re.compile(rf"\b{re.escape(term)}\b") for term in query_terms]
    # Query words that name the topic rather than merely asking for "news".
    subject_terms = [t for t in query_terms if t not in _NEWS_HINTS]

    def title_score(title: str) -> float:
        """Fraction of query terms that appear as whole words in the title."""
        if not title or not term_patterns:
            return 0.0
        title_lc = title.lower()
        matches = sum(1 for pattern in term_patterns if pattern.search(title_lc))
        return matches / len(term_patterns)

    def snippet_score(snippet: str) -> float:
        """Average of a length factor (capped at 200 chars) and term coverage."""
        if not snippet:
            return 0.0
        length_factor = min(len(snippet), 200) / 200
        snippet_lc = snippet.lower()
        term_hits = sum(1 for term in query_terms if term in snippet_lc)
        term_factor = term_hits / len(query_terms) if query_terms else 0.0
        return (length_factor + term_factor) / 2

    def domain_score(url: str) -> float:
        """Authority score for the result's host: 1.0, 0.7, 0.4, or 0.0 if unknown."""
        netloc = _domain(url)
        if not netloc:
            return 0.0
        if netloc in _TRUSTED_NEWS_DOMAINS or netloc.endswith((".edu", ".gov")):
            return 1.0
        if netloc.endswith(".org"):
            return 0.7
        return 0.4

    def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
        """Bonus/penalty applied only when the query is a news query."""
        if not is_news_query:
            return 0.0
        text = f"{title} {snippet}".lower()
        netloc = _domain(url)
        adjustment = 0.0
        if netloc in _TRUSTED_NEWS_DOMAINS:
            adjustment += 1.2
        if any(term in text for term in ("latest news", "breaking news", "daily coverage", "news from")):
            adjustment += 0.4
        if netloc in _LOW_VALUE_NEWS_DOMAINS:
            adjustment -= 0.8
        if not is_sports_query and (_SPORTS_HINT_RE.search(text) or _SPORTS_HINT_RE.search(netloc)):
            adjustment -= 1.5
        # A country/news query should not rank a page whose title/snippet barely
        # mentions the country above actual news pages for that country.
        if subject_terms and not any(t in text or t in netloc for t in subject_terms):
            adjustment -= 1.0
        return adjustment

    ranked = []
    for result in results:
        # ``or ""`` also covers keys that are present but set to None, which
        # would otherwise reach the f-string above as the text "None".
        title = result.get("title") or ""
        snippet = result.get("snippet") or ""
        url = result.get("url") or ""
        age = result.get("age")

        score = (
            2.0 * title_score(title)
            + 1.0 * snippet_score(snippet)
            + 1.5 * domain_score(url)
            + 1.0 * recency_score(age)
            + news_quality_adjustment(title, snippet, url)
        )
        ranked.append((score, result))

    # list.sort is stable even with reverse=True, so ties keep input order.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [result for _, result in ranked]
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from src.search.ranking import rank_search_results
|
from services.search.ranking import rank_search_results
|
||||||
|
|
||||||
|
|
||||||
def test_news_queries_prefer_news_sources_over_sports_and_social_results():
|
def test_news_queries_prefer_news_sources_over_sports_and_social_results():
|
||||||
|
|||||||
@@ -8,7 +8,8 @@ module-level, time-injectable function.
|
|||||||
|
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from src.search.ranking import recency_score, _utcnow_naive
|
import services.search.ranking as live_ranking
|
||||||
|
from services.search.ranking import recency_score, _utcnow_naive, rank_search_results
|
||||||
|
|
||||||
|
|
||||||
def test_fresh_result_scores_one():
|
def test_fresh_result_scores_one():
|
||||||
@@ -37,3 +38,37 @@ def test_default_now_is_naive_utc():
|
|||||||
assert now.tzinfo is None
|
assert now.tzinfo is None
|
||||||
reference = datetime.now(timezone.utc).replace(tzinfo=None)
|
reference = datetime.now(timezone.utc).replace(tzinfo=None)
|
||||||
assert abs((now - reference).total_seconds()) < 5
|
assert abs((now - reference).total_seconds()) < 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_supported_timestamp_formats_parse():
    # Each layout the implementation currently accepts, all naming a moment
    # about four days before the pinned clock, so every one scores 1.0.
    pinned_now = datetime(2026, 1, 5, 12, 0, 0)
    supported_stamps = (
        "2026-01-01",
        "2026-01-01T08:30:00",
        "2026-01-01 08:30:00",
    )
    for stamp in supported_stamps:
        assert recency_score(stamp, now=pinned_now) == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_shim_reexports_live_objects():
    # src.search.ranking is only a compatibility shim: every name it exposes
    # must be the very same object as in the live services module, otherwise
    # the two could drift apart.
    import src.search.ranking as shim

    for name in ("recency_score", "rank_search_results", "_utcnow_naive"):
        assert getattr(shim, name) is getattr(live_ranking, name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_live_rank_path_prefers_newer_result(monkeypatch):
    # Freeze the live module's clock so the age-based score is deterministic.
    monkeypatch.setattr(live_ranking, "_utcnow_naive", lambda: datetime(2026, 1, 31))

    # Same title and snippet; only the URL and the age differ, so recency is
    # the sole thing that can separate the two results.
    older = {"title": "Report", "url": "https://example.org/a", "snippet": "x", "age": "2026-01-01"}
    newer = {"title": "Report", "url": "https://example.org/b", "snippet": "x", "age": "2026-01-29"}

    ranked = rank_search_results("report", [older, newer])

    ranked_urls = [entry["url"] for entry in ranked]
    assert ranked_urls == ["https://example.org/b", "https://example.org/a"]
|
||||||
|
|||||||
Reference in New Issue
Block a user