mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
Search: align service provider guards
Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
@@ -76,6 +76,43 @@ def _get_result_count() -> int:
|
|||||||
return 5
|
return 5
|
||||||
|
|
||||||
|
|
||||||
|
# Canonical SafeSearch levels: "strict" (default), "moderate", "off".
|
||||||
|
# Each provider has its own knob name and value space -- see _safesearch_for(...).
|
||||||
|
_SAFESEARCH_LEVELS = ("strict", "moderate", "off")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_safesearch_level() -> str:
    """Return the configured SafeSearch level normalized to a canonical value.

    Reads ``search_safesearch`` from the search settings, trims and
    lowercases it, and maps it onto one of ``_SAFESEARCH_LEVELS``. Common
    aliases ("on", "high", "medium", "none", "0"/"1"/"2", ...) are translated;
    a missing, empty, or unrecognized value falls back to ``"strict"``.

    Non-string values are converted with ``str()`` before normalizing, so a
    setting stored as the integer 0, 1 or 2 behaves like the string
    "0", "1" or "2" instead of raising ``AttributeError`` on ``.strip()``.
    """
    configured = _get_search_settings().get("search_safesearch")
    if configured is None:
        return "strict"

    raw = str(configured).strip().lower()
    if raw in _SAFESEARCH_LEVELS:
        return raw

    aliases = {
        "on": "strict", "high": "strict", "2": "strict",
        "medium": "moderate", "1": "moderate", "default": "moderate",
        "none": "off", "disabled": "off", "0": "off",
    }
    # Anything unrecognized (including "") resolves to the safest level.
    return aliases.get(raw, "strict")
|
||||||
|
|
||||||
|
|
||||||
|
def _safesearch_for(provider: str) -> Optional[str]:
    """Map the canonical SafeSearch level onto one provider's own value.

    The level comes from ``_get_safesearch_level()``. Returns the string the
    given provider expects, or ``None`` when the provider name is not handled
    here or (for ``google_pse`` / ``serper``) when the level is ``"off"``.
    """
    level = _get_safesearch_level()

    # Brave takes the canonical level unchanged.
    if provider == "brave":
        return level

    # These two only get a value when filtering is enabled.
    if provider in ("google_pse", "serper"):
        return "active" if level != "off" else None

    value_maps = {
        "searxng": {"strict": "2", "moderate": "1", "off": "0"},
        "duckduckgo_lib": {"strict": "on", "moderate": "moderate", "off": "off"},
        "duckduckgo_html": {"strict": "1", "moderate": "-1", "off": "-2"},
    }
    table = value_maps.get(provider)
    if table is None:
        return None
    return table[level]
|
||||||
|
|
||||||
|
|
||||||
# ── SearXNG ──
|
# ── SearXNG ──
|
||||||
|
|
||||||
_NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag")
|
_NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag")
|
||||||
@@ -105,7 +142,12 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general",
|
|||||||
# languages and brand-ambiguous terms bleed in foreign SEO pages (e.g.
|
# languages and brand-ambiguous terms bleed in foreign SEO pages (e.g.
|
||||||
# "Odyssey" → Honda Japan, "Trojan" → Japanese malware blogs, "Polyphemus"
|
# "Odyssey" → Honda Japan, "Trojan" → Japanese malware blogs, "Polyphemus"
|
||||||
# → Chinese math forums). The news path already did this; general didn't.
|
# → Chinese math forums). The news path already did this; general didn't.
|
||||||
params = {"q": query, "format": "json", "language": "en"}
|
params = {
|
||||||
|
"q": query,
|
||||||
|
"format": "json",
|
||||||
|
"language": "en",
|
||||||
|
"safesearch": _safesearch_for("searxng"),
|
||||||
|
}
|
||||||
q_lc = query.lower()
|
q_lc = query.lower()
|
||||||
is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS)
|
is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS)
|
||||||
if is_news and categories == "general":
|
if is_news and categories == "general":
|
||||||
@@ -154,6 +196,7 @@ def searxng_search_api(query: str, count: int = 10, categories: str = "general",
|
|||||||
"format": "json",
|
"format": "json",
|
||||||
"language": "en",
|
"language": "en",
|
||||||
"categories": "general",
|
"categories": "general",
|
||||||
|
"safesearch": _safesearch_for("searxng"),
|
||||||
}
|
}
|
||||||
if _GENERAL_ENGINES:
|
if _GENERAL_ENGINES:
|
||||||
fallback["engines"] = _GENERAL_ENGINES
|
fallback["engines"] = _GENERAL_ENGINES
|
||||||
@@ -204,7 +247,7 @@ def searxng_search(query, max_results=10):
|
|||||||
try:
|
try:
|
||||||
response = httpx.get(
|
response = httpx.get(
|
||||||
f"{instance}/search",
|
f"{instance}/search",
|
||||||
params={"q": query},
|
params={"q": query, "safesearch": _safesearch_for("searxng")},
|
||||||
headers=req_headers,
|
headers=req_headers,
|
||||||
timeout=10,
|
timeout=10,
|
||||||
)
|
)
|
||||||
@@ -249,7 +292,11 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"}
|
headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"}
|
||||||
params = {"q": enhanced_query, "count": count}
|
params = {
|
||||||
|
"q": enhanced_query,
|
||||||
|
"count": count,
|
||||||
|
"safesearch": _safesearch_for("brave"),
|
||||||
|
}
|
||||||
if time_filter:
|
if time_filter:
|
||||||
time_map = {"day": "day", "week": "week", "month": "month", "year": "year"}
|
time_map = {"day": "day", "week": "week", "month": "month", "year": "year"}
|
||||||
if time_filter in time_map:
|
if time_filter in time_map:
|
||||||
@@ -298,32 +345,40 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
|
|||||||
|
|
||||||
# ── DuckDuckGo (free, no key) ──
|
# ── DuckDuckGo (free, no key) ──
|
||||||
|
|
||||||
|
def _is_duckduckgo_host(host: str) -> bool:
|
||||||
|
"""True only for duckduckgo.com and its subdomains."""
|
||||||
|
host = (host or "").lower()
|
||||||
|
return host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_ddg_redirect(raw: str) -> str:
    """Resolve a DuckDuckGo ``/l/?uddg=`` redirect URL to its destination.

    Protocol-relative (``//host/...``) and site-relative (``/l/...``) inputs
    are first made absolute. The ``uddg`` target is returned only when the
    host passes ``_is_duckduckgo_host`` and the path is ``/l``; any other
    URL is returned in its absolute form. Empty input is returned unchanged.
    """
    if not raw:
        return raw

    resolved = raw
    if resolved.startswith("//"):
        resolved = "https:" + resolved
    elif resolved.startswith("/"):
        resolved = urljoin("https://html.duckduckgo.com", resolved)

    try:
        parsed = urlparse(resolved)
        if _is_duckduckgo_host(parsed.hostname) and parsed.path.rstrip("/") == "/l":
            targets = parse_qs(parsed.query).get("uddg")
            if targets:
                return targets[0]
    except ValueError:
        # urllib.parse signals malformed URLs (e.g. a broken IPv6 literal)
        # with ValueError; keep the best-effort behavior for those only.
        pass
    return resolved
|
||||||
|
|
||||||
|
|
||||||
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
|
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
|
||||||
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
|
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
|
||||||
def _resolve_url(raw: str) -> str:
|
|
||||||
"""Resolve DuckDuckGo redirect URL to the actual destination URL."""
|
|
||||||
if not raw:
|
|
||||||
return raw
|
|
||||||
resolved = raw
|
|
||||||
if resolved.startswith("//"):
|
|
||||||
resolved = "https:" + resolved
|
|
||||||
elif resolved.startswith("/"):
|
|
||||||
resolved = urljoin("https://html.duckduckgo.com", resolved)
|
|
||||||
try:
|
|
||||||
parsed = urlparse(resolved)
|
|
||||||
if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
|
|
||||||
qs = parse_qs(parsed.query)
|
|
||||||
if "uddg" in qs:
|
|
||||||
return qs["uddg"][0]
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return resolved
|
|
||||||
|
|
||||||
def _html_fallback() -> List[dict]:
|
def _html_fallback() -> List[dict]:
|
||||||
try:
|
try:
|
||||||
response = httpx.get(
|
response = httpx.get(
|
||||||
"https://html.duckduckgo.com/html/",
|
"https://html.duckduckgo.com/html/",
|
||||||
params={"q": query},
|
params={"q": query, "kp": _safesearch_for("duckduckgo_html")},
|
||||||
headers={"User-Agent": "Mozilla/5.0"},
|
headers={"User-Agent": "Mozilla/5.0"},
|
||||||
timeout=REQUEST_TIMEOUT,
|
timeout=REQUEST_TIMEOUT,
|
||||||
)
|
)
|
||||||
@@ -334,7 +389,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
|
|||||||
link = result.select_one(".result__a")
|
link = result.select_one(".result__a")
|
||||||
if not link:
|
if not link:
|
||||||
continue
|
continue
|
||||||
url = _resolve_url(link.get("href", ""))
|
url = _resolve_ddg_redirect(link.get("href", ""))
|
||||||
if not url:
|
if not url:
|
||||||
continue
|
continue
|
||||||
snippet_el = result.select_one(".result__snippet")
|
snippet_el = result.select_one(".result__snippet")
|
||||||
@@ -362,7 +417,12 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
ddgs = DDGS()
|
ddgs = DDGS()
|
||||||
raw = ddgs.text(query, max_results=count, timelimit=timelimit)
|
raw = ddgs.text(
|
||||||
|
query,
|
||||||
|
max_results=count,
|
||||||
|
timelimit=timelimit,
|
||||||
|
safesearch=_safesearch_for("duckduckgo_lib"),
|
||||||
|
)
|
||||||
results = []
|
results = []
|
||||||
for item in raw:
|
for item in raw:
|
||||||
url = item.get("href", "")
|
url = item.get("href", "")
|
||||||
@@ -404,6 +464,9 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] =
|
|||||||
"q": query,
|
"q": query,
|
||||||
"num": min(count, 10), # Google PSE max is 10 per request
|
"num": min(count, 10), # Google PSE max is 10 per request
|
||||||
}
|
}
|
||||||
|
safe = _safesearch_for("google_pse")
|
||||||
|
if safe:
|
||||||
|
params["safe"] = safe
|
||||||
if time_filter:
|
if time_filter:
|
||||||
# dateRestrict: d[number], w[number], m[number], y[number]
|
# dateRestrict: d[number], w[number], m[number], y[number]
|
||||||
time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"}
|
time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"}
|
||||||
@@ -508,6 +571,9 @@ def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None
|
|||||||
"q": query,
|
"q": query,
|
||||||
"num": count,
|
"num": count,
|
||||||
}
|
}
|
||||||
|
safe = _safesearch_for("serper")
|
||||||
|
if safe:
|
||||||
|
payload["safe"] = safe
|
||||||
if time_filter:
|
if time_filter:
|
||||||
time_map = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"}
|
time_map = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"}
|
||||||
if time_filter in time_map:
|
if time_filter in time_map:
|
||||||
|
|||||||
@@ -0,0 +1,101 @@
|
|||||||
|
"""Regression tests for the services.search provider copy.
|
||||||
|
|
||||||
|
The UI search routes import services.search, while agent/deep-research paths
|
||||||
|
still import src.search. Keep the service-side copy aligned with the safer
|
||||||
|
provider guards already present in src.search.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from services.search import providers
|
||||||
|
|
||||||
|
|
||||||
|
def test_service_safesearch_values_match_provider_contract(monkeypatch):
    """Check the per-provider SafeSearch values for the strict and off levels."""
    expected_by_level = {
        "strict": {
            "searxng": "2",
            "brave": "strict",
            "duckduckgo_lib": "on",
            "duckduckgo_html": "1",
            "google_pse": "active",
            "serper": "active",
        },
        "off": {
            "searxng": "0",
            "brave": "off",
            "duckduckgo_lib": "off",
            "duckduckgo_html": "-2",
            "google_pse": None,
            "serper": None,
        },
    }

    for level, expected in expected_by_level.items():
        # Bind `level` as a default so each stub returns its own setting.
        monkeypatch.setattr(
            providers,
            "_get_search_settings",
            lambda level=level: {"search_safesearch": level},
        )
        for provider, value in expected.items():
            actual = providers._safesearch_for(provider)
            if value is None:
                assert actual is None
            else:
                assert actual == value
|
||||||
|
|
||||||
|
|
||||||
|
def test_service_searxng_json_sends_safesearch(monkeypatch):
    """The SearXNG JSON request must carry the translated safesearch value."""
    captured = {}

    class _StubResponse:
        """Minimal stand-in for the httpx response used by the provider."""

        def raise_for_status(self):
            return None

        def json(self):
            hit = {"title": "Result", "url": "https://example.com", "content": "Snippet"}
            return {"results": [hit]}

    def _record_get(url, **kwargs):
        # Remember what the provider asked for so it can be asserted below.
        captured["url"] = url
        captured["params"] = kwargs["params"]
        return _StubResponse()

    stubs = (
        ("_get_search_instance", lambda: "http://searx.test"),
        ("_get_search_settings", lambda: {"search_safesearch": "moderate"}),
    )
    for attr_name, stub in stubs:
        monkeypatch.setattr(providers, attr_name, stub)
    monkeypatch.setattr(providers.httpx, "get", _record_get)

    results = providers.searxng_search_api("odysseus", count=1)

    assert results
    assert captured["url"] == "http://searx.test/search"
    # "moderate" is expected to reach SearXNG as "1".
    assert captured["params"]["safesearch"] == "1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_service_ddg_redirect_ignores_lookalike_hosts():
    """Only genuine duckduckgo.com redirect links may be unwrapped."""
    redirect_query = "uddg=https%3A%2F%2Fexample.com"

    lookalike_urls = [
        f"https://{host}/l/?{redirect_query}"
        for host in ("duckduckgo.com.evil.com", "notduckduckgo.com")
    ]
    for candidate in lookalike_urls:
        # Lookalike hosts must come back untouched.
        assert providers._resolve_ddg_redirect(candidate) == candidate

    genuine = f"https://duckduckgo.com/l/?{redirect_query}"
    assert providers._resolve_ddg_redirect(genuine) == "https://example.com"
|
||||||
|
|
||||||
|
|
||||||
|
def test_service_ddg_html_fallback_sends_safesearch(monkeypatch):
    """The DuckDuckGo HTML request must send ``kp`` and keep lookalike links intact."""
    captured = {}
    html = """
    <html><body>
      <div class="result">
        <a class="result__a" href="https://notduckduckgo.com/l/?uddg=https%3A%2F%2Fevil.example">
          Lookalike
        </a>
        <a class="result__snippet">Snippet</a>
      </div>
    </body></html>
    """

    class _StubResponse:
        """Minimal stand-in for the httpx response used by the provider."""

        text = html

        def raise_for_status(self):
            return None

    def _record_get(url, **kwargs):
        captured["params"] = kwargs["params"]
        return _StubResponse()

    # A None entry in sys.modules makes `import duckduckgo_search` fail;
    # presumably this pushes duckduckgo_search() onto its HTML path.
    monkeypatch.setitem(sys.modules, "duckduckgo_search", None)
    monkeypatch.setattr(
        providers, "_get_search_settings", lambda: {"search_safesearch": "off"}
    )
    monkeypatch.setattr(providers.httpx, "get", _record_get)

    results = providers.duckduckgo_search("odysseus", count=1)

    assert captured["params"]["kp"] == "-2"
    # The lookalike host's redirect link must not have been unwrapped.
    first_url = results[0]["url"]
    assert first_url.startswith("https://notduckduckgo.com/")
|
||||||
Reference in New Issue
Block a user