mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
Odysseus v1.0
This commit is contained in:
@@ -0,0 +1,35 @@
|
||||
"""Search service — web search with SearXNG."""
|
||||
|
||||
from .core import (
|
||||
comprehensive_web_search,
|
||||
get_search_config,
|
||||
invalidate_search_cache,
|
||||
searxng_search_results,
|
||||
update_search_config,
|
||||
)
|
||||
from .content import fetch_webpage_content
|
||||
from .providers import searxng_search, searxng_search_api, PROVIDER_INFO
|
||||
from .analytics import get_search_stats, SearchEngineError, NetworkError, ParseError, RateLimitError
|
||||
from .service import SearchService, SearchResult, SearchResponse
|
||||
|
||||
__all__ = [
|
||||
# Service interface (preferred)
|
||||
"SearchService",
|
||||
"SearchResult",
|
||||
"SearchResponse",
|
||||
# Low-level functions (for backwards compat)
|
||||
"comprehensive_web_search",
|
||||
"fetch_webpage_content",
|
||||
"get_search_config",
|
||||
"get_search_stats",
|
||||
"invalidate_search_cache",
|
||||
"searxng_search",
|
||||
"searxng_search_api",
|
||||
"searxng_search_results",
|
||||
"update_search_config",
|
||||
"PROVIDER_INFO",
|
||||
"SearchEngineError",
|
||||
"NetworkError",
|
||||
"ParseError",
|
||||
"RateLimitError",
|
||||
]
|
||||
@@ -0,0 +1,136 @@
|
||||
"""Search analytics, metrics tracking, and exception hierarchy."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
|
||||
from .cache import cache_metrics
|
||||
|
||||
logger = logging.getLogger(__name__)

# Dedicated error logger that writes WARNING-and-above records to its own file.
_error_log_path = Path(__file__).resolve().parent.parent / "search_engine_error.log"
# delay=True postpones opening the file until the first record is emitted, so
# importing this module neither creates the log file nor fails when the target
# directory is not writable.
_error_handler = logging.FileHandler(_error_log_path, encoding="utf-8", delay=True)
_error_handler.setLevel(logging.WARNING)
_error_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))
error_logger = logging.getLogger("search_engine_error")
# logging.getLogger returns the same logger object for the same name, so a
# re-import/reload of this module would otherwise attach a second handler and
# write every record twice. Attach only if no handler for this file exists yet.
if not any(
    isinstance(existing, logging.FileHandler)
    and existing.baseFilename == _error_handler.baseFilename
    for existing in error_logger.handlers
):
    error_logger.addHandler(_error_handler)
# Keep these records out of the root logger's handlers.
error_logger.propagate = False

# Analytics file
ANALYTICS_FILE = Path(__file__).resolve().parent.parent / "search_analytics.json"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Custom exception hierarchy
|
||||
# ----------------------------------------------------------------------
|
||||
class SearchEngineError(Exception):
    """Root of the search package's exception hierarchy."""


class NetworkError(SearchEngineError):
    """Signals a failed network request, such as a timeout or a DNS failure."""


class ParseError(SearchEngineError):
    """Signals that HTML or other fetched content could not be parsed."""


class RateLimitError(SearchEngineError):
    """Signals that the remote service answered with a rate limit (HTTP 429)."""
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Analytics helpers
|
||||
# ----------------------------------------------------------------------
|
||||
def _load_analytics() -> Dict[str, Any]:
    """Load analytics data from the JSON file, creating defaults if missing.

    Returns:
        A dict that always contains the counters ``total_queries``,
        ``successful_queries``, ``failed_queries``, ``cache_hits`` and
        ``cache_misses`` plus the ``query_patterns`` mapping. Keys absent
        from the file are filled with their defaults so callers can index
        them directly. When the file is missing it is created with the
        defaults; when it cannot be read or does not hold a JSON object, the
        defaults are returned and the file is left untouched.
    """
    # Built on every call so each caller receives its own mutable dict.
    defaults: Dict[str, Any] = {
        "total_queries": 0,
        "successful_queries": 0,
        "failed_queries": 0,
        "cache_hits": 0,
        "cache_misses": 0,
        "query_patterns": {},
    }

    if not ANALYTICS_FILE.exists():
        _save_analytics(defaults)
        return defaults

    try:
        with open(ANALYTICS_FILE, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except Exception as e:
        logger.warning(f"Failed to load analytics file: {e}")
        return defaults

    if not isinstance(loaded, dict):
        logger.warning("Failed to load analytics file: top-level JSON value is not an object")
        return defaults

    # Values from the file win; defaults only fill in whatever is missing, so
    # a file lacking a key no longer causes KeyError in the callers.
    merged = {**defaults, **loaded}
    if not isinstance(merged["query_patterns"], dict):
        merged["query_patterns"] = {}
    return merged
|
||||
|
||||
|
||||
def _save_analytics(data: Dict[str, Any]) -> None:
    """Persist analytics data to the JSON file.

    Best effort: any failure is logged as a warning and never raised.

    Args:
        data: The analytics dict to write; must be JSON-serializable.
    """
    try:
        # Serialize before opening the file: open(..., "w") truncates
        # immediately, so serializing afterwards would wipe the existing
        # analytics whenever `data` turns out not to be serializable.
        payload = json.dumps(data, indent=2)
        with open(ANALYTICS_FILE, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception as e:
        logger.warning(f"Failed to write analytics file: {e}")
|
||||
|
||||
|
||||
def _record_query(query: str, success: bool, cache_hit: bool) -> None:
    """Update analytics for a single query execution.

    Increments the persisted totals, the in-memory ``cache_metrics``
    hit/miss counters and the per-query entry under ``query_patterns``, then
    writes the analytics back to disk.

    Args:
        query: The query text, used verbatim as the ``query_patterns`` key.
        success: Whether the query produced results.
        cache_hit: Whether the results were served from the cache.
    """
    analytics = _load_analytics()

    def _bump(key: str) -> None:
        # .get() tolerates an analytics file that lacks this counter.
        analytics[key] = analytics.get(key, 0) + 1

    _bump("total_queries")
    _bump("successful_queries" if success else "failed_queries")

    if cache_hit:
        _bump("cache_hits")
        cache_metrics["hits"] += 1
    else:
        _bump("cache_misses")
        cache_metrics["misses"] += 1

    patterns = analytics.get("query_patterns")
    if not isinstance(patterns, dict):
        patterns = {}
        analytics["query_patterns"] = patterns

    entry = patterns.get(query)
    if not isinstance(entry, dict):
        entry = {}
    entry["count"] = entry.get("count", 0) + 1
    # Always materialize "successes" so every entry carries both fields.
    entry["successes"] = entry.get("successes", 0) + (1 if success else 0)
    patterns[query] = entry

    _save_analytics(analytics)
|
||||
|
||||
|
||||
def get_search_stats() -> Dict[str, Any]:
    """Summarize the persisted analytics together with the runtime cache counters.

    Returns:
        A dict with the five most frequent queries, the success and cache-hit
        rates, the raw persisted counters, and the in-memory ``cache_metrics``
        values (evictions, hits, misses).
    """
    data = _load_analytics()

    total_queries = data.get("total_queries", 0)
    successes = data.get("successful_queries", 0)
    failures = data.get("failed_queries", 0)
    hits = data.get("cache_hits", 0)
    misses = data.get("cache_misses", 0)

    # A zero denominator is replaced by 1 so the rate is 0 instead of an error.
    success_rate = successes / (total_queries or 1)
    cache_hit_rate = hits / ((hits + misses) or 1)

    frequencies = Counter()
    for text, info in data.get("query_patterns", {}).items():
        frequencies[text] = info["count"]
    top_queries = [text for text, _ in frequencies.most_common(5)]

    stats: Dict[str, Any] = {}
    stats["most_common_queries"] = top_queries
    stats["success_rate"] = success_rate
    stats["cache_hit_rate"] = cache_hit_rate
    stats["total_queries"] = total_queries
    stats["successful_queries"] = successes
    stats["failed_queries"] = failures
    stats["cache_hits"] = hits
    stats["cache_misses"] = misses
    stats["cache_evictions"] = cache_metrics["evictions"]
    stats["runtime_cache_hits"] = cache_metrics["hits"]
    stats["runtime_cache_misses"] = cache_metrics["misses"]
    return stats
|
||||
@@ -0,0 +1,57 @@
|
||||
"""Search and content caching with LRU eviction."""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Cache directories
|
||||
CACHE_DIR = Path(__file__).resolve().parent.parent / "cache"
|
||||
SEARCH_CACHE_DIR = CACHE_DIR / "search"
|
||||
CONTENT_CACHE_DIR = CACHE_DIR / "content"
|
||||
CACHE_MAX_ENTRIES = 1000
|
||||
|
||||
# Create cache directories
|
||||
SEARCH_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONTENT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Track cache size for LRU eviction
|
||||
search_cache_index: Dict[str, datetime] = {}
|
||||
content_cache_index: Dict[str, datetime] = {}
|
||||
|
||||
# Cache metrics (shared across modules)
|
||||
cache_metrics = {"hits": 0, "misses": 0, "evictions": 0}
|
||||
|
||||
|
||||
def generate_cache_key(data: str) -> str:
    """Return the hexadecimal SHA-256 digest of *data* (UTF-8 encoded) as the cache key."""
    digest = hashlib.sha256()
    digest.update(data.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def cleanup_cache(cache_dir: Path, cache_index: Dict[str, datetime], max_age: timedelta):
    """Drop expired or orphaned index entries, then trim the index to the size cap.

    Args:
        cache_dir: Directory holding the ``*.cache`` files.
        cache_index: In-memory mapping of cache key to the time it was stored;
            mutated in place.
        max_age: Entries older than this are removed.

    Every removed index entry increments ``cache_metrics["evictions"]``.
    """
    now = datetime.now()

    # Map the part of each file name before the first dot to its path.
    on_disk = {}
    for path in cache_dir.glob("*.cache"):
        on_disk[path.name.partition(".")[0]] = path

    # Pass 1: entries that are too old or whose file has disappeared.
    stale_keys = []
    for key, stored_at in list(cache_index.items()):
        backing_file = on_disk.get(key)
        if backing_file is not None and now - stored_at <= max_age:
            continue
        stale_keys.append(key)
        if backing_file is not None:
            backing_file.unlink(missing_ok=True)

    for key in stale_keys:
        cache_index.pop(key, None)
        cache_metrics["evictions"] += 1

    # Pass 2: if still over the cap, evict the entries with the oldest timestamps.
    overflow = len(cache_index) - CACHE_MAX_ENTRIES
    if overflow > 0:
        oldest_first = sorted(cache_index, key=cache_index.get)
        for key in oldest_first[:overflow]:
            cache_index.pop(key, None)
            (cache_dir / f"{key}.cache").unlink(missing_ok=True)
            cache_metrics["evictions"] += 1
|
||||
@@ -0,0 +1,360 @@
|
||||
"""Webpage content fetching with caching, PDF extraction, and summarization helpers."""
|
||||
|
||||
import io
|
||||
import ipaddress
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import socket
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .analytics import RateLimitError, error_logger
|
||||
from .cache import (
|
||||
CONTENT_CACHE_DIR,
|
||||
content_cache_index,
|
||||
generate_cache_key,
|
||||
cleanup_cache,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Networks that must never be fetched. Most are also covered by the
# ipaddress flags checked below; they are listed explicitly as a second line
# of defence.
_PRIVATE_NETWORKS = (
    ipaddress.ip_network("0.0.0.0/8"),
    ipaddress.ip_network("10.0.0.0/8"),
    # RFC 6598 shared address space; `is_private` is False for this range,
    # so it has to be listed explicitly.
    ipaddress.ip_network("100.64.0.0/10"),
    ipaddress.ip_network("127.0.0.0/8"),
    ipaddress.ip_network("169.254.0.0/16"),
    ipaddress.ip_network("172.16.0.0/12"),
    ipaddress.ip_network("192.168.0.0/16"),
    ipaddress.ip_network("::1/128"),
    ipaddress.ip_network("fc00::/7"),
    ipaddress.ip_network("fe80::/10"),
)


def _is_private_address(addr: ipaddress._BaseAddress) -> bool:
    """Return True when *addr* is not a public unicast address.

    Args:
        addr: An ``ipaddress`` IPv4 or IPv6 address object.

    Returns:
        True for private, loopback, link-local, multicast, reserved and
        unspecified addresses, and for anything inside ``_PRIVATE_NETWORKS``;
        False otherwise.
    """
    return (
        addr.is_private
        or addr.is_loopback
        or addr.is_link_local
        or addr.is_multicast
        or addr.is_reserved
        or addr.is_unspecified
        or any(addr in net for net in _PRIVATE_NETWORKS)
    )
|
||||
|
||||
|
||||
def _resolve_hostname_ips(hostname: str) -> list[ipaddress._BaseAddress]:
    """Resolve *hostname* and return every result that parses as an IP address.

    A failed lookup yields an empty list; individual results that cannot be
    parsed are skipped.
    """
    try:
        addr_infos = socket.getaddrinfo(hostname, None)
    except Exception:
        return []

    resolved = []
    for entry in addr_infos:
        try:
            # entry[4] is the sockaddr tuple; its first item is the address text.
            parsed = ipaddress.ip_address(entry[4][0])
        except Exception:
            continue
        resolved.append(parsed)
    return resolved
|
||||
|
||||
|
||||
def _public_http_url(url: str) -> bool:
    """Decide whether *url* is an http(s) URL that points at a public host.

    Returns False for non-http(s) schemes, missing hosts, well-known internal
    host names and suffixes, IP literals that are private, and host names
    that fail to resolve or resolve to any private address. Any unexpected
    error also yields False.
    """
    blocked_hosts = ("localhost", "metadata", "metadata.google.internal")
    blocked_suffixes = (".local", ".localhost", ".internal", ".lan", ".intranet")
    try:
        parts = urlparse(url)
        if parts.scheme not in ("http", "https"):
            return False

        hostname = (parts.hostname or "").strip()
        if not hostname:
            return False

        lowered = hostname.lower()
        if lowered in blocked_hosts or lowered.endswith(blocked_suffixes):
            return False

        # An IP literal is judged directly; anything else falls through to DNS.
        try:
            return not _is_private_address(ipaddress.ip_address(hostname))
        except ValueError:
            pass

        resolved = _resolve_hostname_ips(hostname)
        if not resolved:
            return False
        return not any(_is_private_address(ip) for ip in resolved)
    except Exception:
        return False
|
||||
|
||||
|
||||
def _get_public_url(url: str, headers: dict, timeout: int, max_redirects: int = 5) -> httpx.Response:
    """GET *url*, following redirects by hand so every hop is vetted first.

    Each URL in the chain must pass ``_public_http_url`` before it is
    requested.

    Returns:
        The first response that is not a 301/302/303/307/308 redirect, or a
        redirect response that carries no ``location`` header.

    Raises:
        httpx.RequestError: If a hop targets a blocked URL, or more than
            *max_redirects* redirects are encountered.
    """
    redirect_statuses = (301, 302, 303, 307, 308)
    target = url
    requests_left = max_redirects + 1

    while requests_left > 0:
        requests_left -= 1
        if not _public_http_url(target):
            raise httpx.RequestError("Blocked private/internal URL", request=httpx.Request("GET", target))

        reply = httpx.get(target, headers=headers, timeout=timeout, follow_redirects=False)
        next_location = None
        if reply.status_code in redirect_statuses:
            next_location = reply.headers.get("location")
        if not next_location:
            return reply

        # The location header may be relative; resolve it against the current URL.
        target = urljoin(str(reply.url), next_location)

    raise httpx.RequestError("Too many redirects", request=httpx.Request("GET", target))
|
||||
|
||||
# PDF extraction (optional dependency)
|
||||
try:
|
||||
from pdfminer.high_level import extract_text as pdf_extract_text
|
||||
except ImportError:
|
||||
pdf_extract_text = None # type: ignore
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# HTML extraction helpers
|
||||
# ----------------------------------------------------------------------
|
||||
def _extract_meta(soup: BeautifulSoup) -> dict:
    """Return the page's meta description and keywords (empty strings when absent)."""

    def _meta_content(name_fragment: str) -> str:
        # First meta tag whose name attribute contains the fragment, case-insensitively.
        tag = soup.find("meta", attrs={"name": re.compile(name_fragment, re.I)})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    return {
        "description": _meta_content("description"),
        "keywords": _meta_content("keywords"),
    }
|
||||
|
||||
|
||||
def _extract_lists(soup: BeautifulSoup) -> List[List[str]]:
    """Collect the item texts of every ul/ol element, one inner list per element.

    Elements without any li items are omitted.
    """
    per_list = (
        [item.get_text(separator=" ", strip=True) for item in list_tag.find_all("li")]
        for list_tag in soup.find_all(["ul", "ol"])
    )
    return [entries for entries in per_list if entries]
|
||||
|
||||
|
||||
def _extract_tables(soup: BeautifulSoup) -> List[List[List[str]]]:
    """Collect every table as a list of rows, each row being a list of cell texts.

    Rows without td/th cells and tables without such rows are omitted.
    """
    extracted = []
    for table_tag in soup.find_all("table"):
        row_cells = (
            [cell.get_text(separator=" ", strip=True) for cell in row.find_all(["td", "th"])]
            for row in table_tag.find_all("tr")
        )
        table_rows = [cells for cells in row_cells if cells]
        if table_rows:
            extracted.append(table_rows)
    return extracted
|
||||
|
||||
|
||||
def _extract_code_blocks(soup: BeautifulSoup) -> List[str]:
    """Return the non-empty text of every pre and code element, in document order."""
    texts = (
        element.get_text(separator=" ", strip=True)
        for element in soup.find_all(["pre", "code"])
    )
    return [text for text in texts if text]
|
||||
|
||||
|
||||
def _detect_js_frameworks(soup: BeautifulSoup) -> bool:
    """Guess, by simple substring matching, whether the page uses a JS framework.

    Returns True when a script's ``src`` or inline text contains one of the
    known framework names, or when an element carries a ``data-reactroot`` or
    ``ng-app`` attribute.
    """
    framework_names = (
        "react", "angular", "vue", "svelte", "next", "nuxt",
        "ember", "backbone", "jquery", "polymer", "mithril",
    )

    def _mentions_framework(text: str) -> bool:
        return any(name in text for name in framework_names)

    for script_tag in soup.find_all("script"):
        if _mentions_framework(script_tag.get("src", "").lower()):
            return True
        inline = script_tag.string
        if inline and _mentions_framework(inline.lower()):
            return True

    if soup.find(attrs={"data-reactroot": True}):
        return True
    if soup.find(attrs={"ng-app": True}):
        return True
    return False
|
||||
|
||||
|
||||
def _empty_result(url: str, error: str = "") -> dict:
    """Return the standard failure payload for *url*, carrying *error* as its message."""
    failure = {"url": url}
    # Text fields are blank and collection fields empty in a failure result.
    failure.update(title="", content="")
    failure.update(lists=[], tables=[], code_blocks=[])
    failure.update(meta_description="", meta_keywords="")
    failure.update(js_rendered=False, js_message="")
    failure.update(success=False, error=error)
    return failure
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Main content fetcher
|
||||
# ----------------------------------------------------------------------
|
||||
def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) -> dict:
    """Fetch and extract meaningful content from a webpage with caching.

    Args:
        url: Address of the page or PDF to fetch. Every hop is vetted by
            ``_get_public_url`` before it is requested.
        timeout: Timeout handed to the HTTP request.
        retry_attempt: Attempt number; only used in log and error messages.

    Returns:
        A dict with the keys ``url``, ``title``, ``content``, ``lists``,
        ``tables``, ``code_blocks``, ``meta_description``, ``meta_keywords``,
        ``js_rendered``, ``js_message``, ``success`` and ``error``. Network
        errors, HTTP error statuses and rate limiting produce a failure
        result (``success`` False) and are not cached; PDF and HTML results,
        including extraction/parse failures, are cached for two hours.
    """
    cache_key = generate_cache_key(url)
    cache_file = CONTENT_CACHE_DIR / f"{cache_key}.cache"

    # Check cache
    if cache_file.exists():
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                cached_data = json.load(f)
            timestamp = datetime.fromisoformat(cached_data["timestamp"])
            if datetime.now() - timestamp < timedelta(hours=2):
                logger.debug(f"Content cache hit for URL: {url}")
                return cached_data["data"]
            else:
                cache_file.unlink(missing_ok=True)
                content_cache_index.pop(cache_key, None)
        except Exception as e:
            # An unreadable or malformed cache file is discarded and refetched.
            logger.warning(f"Failed to read content cache for {url}: {e}")
            cache_file.unlink(missing_ok=True)
            content_cache_index.pop(cache_key, None)

    # Fetch
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
        response = _get_public_url(url, headers=headers, timeout=timeout)

        if response.status_code == 429:
            raise RateLimitError(f"Rate limit hit for {url} (attempt {retry_attempt})")

        response.raise_for_status()
    except httpx.RequestError as e:
        error_logger.error(f"NetworkError fetching {url} (attempt {retry_attempt}): {e}")
        return _empty_result(url, f"NetworkError: {e}")
    except httpx.HTTPStatusError as e:
        # raise_for_status() raises HTTPStatusError, which is not a
        # RequestError; without this clause a 4xx/5xx response would escape
        # as an exception instead of yielding the failure result.
        error_logger.error(f"HTTPError fetching {url} (attempt {retry_attempt}): {e}")
        return _empty_result(url, f"HTTPError: {e}")
    except RateLimitError as e:
        error_logger.error(str(e))
        return _empty_result(url, str(e))

    # PDF handling
    content_type = response.headers.get("Content-Type", "").lower()
    if "application/pdf" in content_type or url.lower().endswith(".pdf"):
        if pdf_extract_text is None:
            logger.error("pdfminer.six is not installed; cannot extract PDF text.")
            pdf_text = ""
        else:
            try:
                pdf_bytes = io.BytesIO(response.content)
                pdf_text = pdf_extract_text(pdf_bytes)
            except Exception as e:
                logger.warning(f"PDF extraction failed for {url}: {e}")
                pdf_text = ""
        result = {
            "url": url,
            "title": os.path.basename(url),
            "content": pdf_text,
            "lists": [],
            "tables": [],
            "code_blocks": [],
            "meta_description": "",
            "meta_keywords": "",
            "js_rendered": False,
            "js_message": "",
            "success": bool(pdf_text),
            "error": "" if pdf_text else "Failed to extract PDF text",
        }
        _cache_result(cache_file, cache_key, result, url)
        return result

    # HTML handling
    try:
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        error_logger.error(f"ParseError parsing HTML from {url} (attempt {retry_attempt}): {e}")
        result = _empty_result(url, f"ParseError: {e}")
        _cache_result(cache_file, cache_key, result, url)
        return result

    title_tag = soup.find("title")
    title_text = title_tag.get_text(strip=True) if title_tag else ""
    meta_info = _extract_meta(soup)
    js_rendered = _detect_js_frameworks(soup)
    js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""

    # Main textual content (heuristic): the first three elements whose class
    # looks content-like, falling back to the whole body.
    main_content = ""
    content_areas = soup.find_all(
        ["main", "article", "section", "div"],
        class_=re.compile("content|main|body|article|post|entry|text", re.I),
    )
    if content_areas:
        for area in content_areas[:3]:
            main_content += area.get_text(separator=" ", strip=True) + " "
    if not main_content:
        body = soup.find("body")
        if body:
            main_content = body.get_text(separator=" ", strip=True)

    # Collapse whitespace and cap the stored text at 8000 characters.
    main_content = re.sub(r"\s+", " ", main_content).strip()[:8000]

    result = {
        "url": url,
        "title": title_text,
        "content": main_content,
        "lists": _extract_lists(soup),
        "tables": _extract_tables(soup),
        "code_blocks": _extract_code_blocks(soup),
        "meta_description": meta_info.get("description", ""),
        "meta_keywords": meta_info.get("keywords", ""),
        "js_rendered": js_rendered,
        "js_message": js_message,
        "success": True,
        "error": "",
    }
    _cache_result(cache_file, cache_key, result, url)
    return result
|
||||
|
||||
|
||||
def _cache_result(cache_file, cache_key: str, result: dict, url: str):
    """Store *result* in the content cache and register it in the index.

    The payload is wrapped with the current timestamp, written to
    *cache_file* as JSON, recorded in ``content_cache_index`` and followed by
    a cleanup pass with a two-hour maximum age. Failures are logged as
    warnings and never raised.
    """
    try:
        envelope = {
            "timestamp": datetime.now().isoformat(),
            "data": result,
        }
        with open(cache_file, "w", encoding="utf-8") as handle:
            json.dump(envelope, handle)
        content_cache_index[cache_key] = datetime.now()
        cleanup_cache(CONTENT_CACHE_DIR, content_cache_index, timedelta(hours=2))
    except Exception as exc:
        logger.warning(f"Failed to write content cache for {url}: {exc}")
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Content summarization helpers
|
||||
# ----------------------------------------------------------------------
|
||||
def extract_key_points(text: str) -> List[str]:
    """Return the text of every bulleted or numbered line in *text*.

    A line counts when it starts (after optional whitespace) with ``-``,
    ``*`` or ``•``, or with digits followed by ``.`` or ``)``, and then
    whitespace.
    """
    bullet_line = re.compile(r"^\s*[-*•]\s+(.*)")
    numbered_line = re.compile(r"^\s*\d+[\.\)]\s+(.*)")
    matches = (
        bullet_line.match(row) or numbered_line.match(row)
        for row in text.splitlines()
    )
    return [found.group(1).strip() for found in matches if found]
|
||||
|
||||
|
||||
def get_tldr(text: str, max_sentences: int = 3) -> str:
    """Build a short summary from the leading sentences of *text*.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``;
    the first *max_sentences* non-empty pieces are joined with single spaces.
    """
    trimmed = []
    for piece in re.split(r"(?<=[.!?])\s+", text):
        if piece:
            trimmed.append(piece.strip())
    return " ".join(trimmed[:max_sentences])
|
||||
|
||||
|
||||
def extract_quotes(text: str) -> List[str]:
    """Return quoted passages of 15 or more characters found in *text*.

    A passage is delimited by single or double quote characters and contains
    neither; surrounding whitespace is stripped from each result.
    """
    quoted = re.compile(r'["\']([^"\']{15,}?)["\']')
    return [inner.strip() for inner in quoted.findall(text)]
|
||||
|
||||
|
||||
def extract_statistics(text: str) -> List[str]:
    """Find numbers, percentages, dates and simple measurements.

    Matches an integer (plain digits or comma-grouped thousands) with an
    optional decimal part, optionally followed by a unit: ``%``, ``‰``,
    ``per cent`` or a single alphabetic word.

    Args:
        text: The text to scan.

    Returns:
        The matched substrings in order of appearance, whitespace-stripped.
    """
    pattern = re.compile(
        # Number: comma-grouped thousands, or any run of digits. A fixed
        # \d{1,3} prefix would never match ungrouped numbers such as "2024".
        r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?"
        # Optional unit. The word boundary applies to alphabetic units only;
        # requiring it after "%" would drop the sign whenever "%" is followed
        # by a space or the end of the text.
        r"(?:\s*(?:%|‰|per cent\b|[a-zA-Z]+\b))?",
        re.IGNORECASE,
    )
    return [m.group(0).strip() for m in pattern.finditer(text)]
|
||||
@@ -0,0 +1,433 @@
|
||||
"""Core search orchestrators: searxng_search_results, comprehensive_web_search, config, cache invalidation."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional, List, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from .analytics import (
|
||||
NetworkError,
|
||||
ParseError,
|
||||
RateLimitError,
|
||||
error_logger,
|
||||
_record_query,
|
||||
)
|
||||
from .cache import (
|
||||
SEARCH_CACHE_DIR,
|
||||
search_cache_index,
|
||||
generate_cache_key,
|
||||
cleanup_cache,
|
||||
)
|
||||
from .query import _cache_duration_for_query
|
||||
from .ranking import rank_search_results
|
||||
from .providers import (
|
||||
searxng_search_api,
|
||||
brave_search,
|
||||
duckduckgo_search,
|
||||
google_pse_search,
|
||||
tavily_search,
|
||||
serper_search,
|
||||
_get_search_settings,
|
||||
_get_result_count,
|
||||
)
|
||||
from .content import (
|
||||
fetch_webpage_content,
|
||||
extract_key_points,
|
||||
get_tldr,
|
||||
extract_quotes,
|
||||
extract_statistics,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ========= CONFIG =========
|
||||
SEARCH_CONFIG: Dict[str, Any] = {
|
||||
"primary_provider": "searxng",
|
||||
}
|
||||
|
||||
|
||||
def get_search_config() -> Dict[str, Any]:
    """Return a copy of the search configuration enriched with live settings.

    On top of ``SEARCH_CONFIG`` the result carries ``active_provider``,
    ``has_api_key`` and ``result_count``, plus ``search_url`` when the active
    provider is SearXNG.
    """
    snapshot = dict(SEARCH_CONFIG)
    settings = _get_search_settings()
    active = settings.get("search_provider", "searxng")
    api_key = settings.get("search_api_key") or ""

    snapshot.update(
        active_provider=active,
        has_api_key=bool(api_key.strip()),
        result_count=_get_result_count(),
    )
    if active == "searxng":
        # Imported here, as in the original, rather than at module level.
        from .providers import _get_search_instance

        snapshot["search_url"] = _get_search_instance()
    return snapshot
|
||||
|
||||
|
||||
def update_search_config(api_key: str = None, **kwargs):
    """Store a new Brave API key in the search configuration.

    A falsy *api_key* leaves the configuration unchanged. Extra keyword
    arguments are accepted but not used.
    """
    if not api_key:
        return
    SEARCH_CONFIG["brave_api_key"] = api_key
|
||||
|
||||
|
||||
def _call_provider(provider_name: str, query: str, count: int, time_filter: str = None) -> List[dict]:
    """Run *query* against the provider called *provider_name*.

    Returns the provider's results, or an empty list for an unknown name.
    """
    # Lambdas keep the lookup lazy: only the selected provider function is
    # resolved and called. SearXNG takes the time filter as a keyword argument.
    dispatch = {
        "searxng": lambda: searxng_search_api(query, count, time_filter=time_filter),
        "brave": lambda: brave_search(query, count, time_filter),
        "duckduckgo": lambda: duckduckgo_search(query, count, time_filter),
        "google_pse": lambda: google_pse_search(query, count, time_filter),
        "tavily": lambda: tavily_search(query, count, time_filter),
        "serper": lambda: serper_search(query, count, time_filter),
    }
    run_search = dispatch.get(provider_name)
    if run_search is None:
        return []
    return run_search()
|
||||
|
||||
|
||||
# If the self-hosted SearXNG instance is up but all enabled engines return
|
||||
# empty, fall back to the no-key provider so "search X" still works on fresh
|
||||
# installs. Users can override/disable with `search_fallback_chain`.
|
||||
_FALLBACK_ORDER = ["duckduckgo"]
|
||||
|
||||
|
||||
def _build_provider_chain(primary: str) -> List[str]:
    """Return the providers to try in order: *primary*, then the fallbacks.

    Fallbacks come from the ``search_fallback_chain`` setting (a list or a
    comma-separated string) or, when that is empty, from ``_FALLBACK_ORDER``.
    Empty names, duplicates and the value ``"disabled"`` are left out.
    """
    configured = _get_search_settings().get("search_fallback_chain") or []
    if isinstance(configured, str):
        pieces = (piece.strip() for piece in configured.split(","))
        configured = [piece for piece in pieces if piece]

    ordered = [primary]
    for candidate in (configured or _FALLBACK_ORDER):
        if not candidate or candidate == "disabled":
            continue
        # `ordered` already starts with the primary, so this also skips it.
        if candidate in ordered:
            continue
        ordered.append(candidate)
    return ordered
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Unified search with caching and retry
|
||||
# ----------------------------------------------------------------------
|
||||
def searxng_search_results(query: str, count: int = 10, time_filter: str = None) -> list[dict]:
    """Perform a web search using configured provider with caching and retry.

    Args:
        query: The search text.
        count: Number of results to request. The default of 10 is replaced by
            the configured result count.
        time_filter: Optional time restriction passed through to the provider.

    Returns:
        The ranked results, or an empty list when search is disabled or every
        provider in the chain failed or returned nothing.
    """
    settings = _get_search_settings()
    search_provider = settings.get("search_provider", "searxng")

    # Honour the admin switch before touching the cache; otherwise a disabled
    # search would keep answering from previously cached results.
    if search_provider == "disabled":
        logger.info("Search is disabled via admin settings")
        return []

    result_count = _get_result_count()
    # Use configured count if caller used default
    if count == 10:
        count = result_count

    cache_key = generate_cache_key(f"{query}|{count}|{time_filter}")
    cache_file = SEARCH_CACHE_DIR / f"{cache_key}.cache"

    # Check cache
    if cache_file.exists():
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                cached_data = json.load(f)
            expiry_raw = cached_data.get("expiry")
            expiry = datetime.fromisoformat(expiry_raw) if expiry_raw else None
            if expiry and datetime.now() < expiry:
                logger.debug(f"Search cache hit for query: {query}")
                results = cached_data["data"]
                _record_query(query, bool(results), cache_hit=True)
                return results
            else:
                cache_file.unlink(missing_ok=True)
                search_cache_index.pop(cache_key, None)
        except Exception as e:
            # An unreadable or malformed cache file is discarded.
            logger.warning(f"Failed to read search cache for {query}: {e}")
            cache_file.unlink(missing_ok=True)
            search_cache_index.pop(cache_key, None)

    logger.debug(f"Search cache miss for query: {query}")

    provider_chain = _build_provider_chain(search_provider)

    # Try each provider up to twice; stop at the first non-empty result set.
    results: List[dict] = []
    for provider_name in provider_chain:
        for attempt in range(2):
            try:
                logger.info(f"Attempting {provider_name} search (attempt {attempt + 1})")
                results = _call_provider(provider_name, query, count, time_filter)
                if results:
                    logger.info(f"{provider_name} search succeeded with {len(results)} results")
                    break
            except (NetworkError, ParseError, RateLimitError) as e:
                error_logger.error(f"{provider_name} search error (attempt {attempt + 1}): {e}")
            except Exception as e:
                error_logger.error(f"Unexpected error during {provider_name} search (attempt {attempt + 1}): {e}")
        if results:
            break

    success = bool(results)
    _record_query(query, success, cache_hit=False)

    if success:
        results = rank_search_results(query, results)
        try:
            expiry = datetime.now() + _cache_duration_for_query(query)
            cache_data = {
                "timestamp": datetime.now().isoformat(),
                "expiry": expiry.isoformat(),
                "data": results,
            }
            with open(cache_file, "w", encoding="utf-8") as f:
                json.dump(cache_data, f)
            search_cache_index[cache_key] = datetime.now()
            cleanup_cache(SEARCH_CACHE_DIR, search_cache_index, timedelta(hours=1))
        except Exception as e:
            # Caching is best effort; the results are still returned.
            logger.warning(f"Failed to write search cache for {query}: {e}")

    if not success:
        logger.error(f"All search providers failed for query: {query}")

    return results
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Cache invalidation
|
||||
# ----------------------------------------------------------------------
|
||||
def invalidate_search_cache(query: Optional[str] = None) -> None:
    """Invalidate cached search results.

    Args:
        query: ``None`` clears every cached search; otherwise only the
            entries stored for this query with the default result count and
            no time filter are removed. Entries cached with an explicit
            non-default count or with a time filter are not targeted.
    """
    if query is None:
        for file in SEARCH_CACHE_DIR.glob("*.cache"):
            try:
                file.unlink(missing_ok=True)
            except Exception as e:
                error_logger.warning(f"Failed to delete cache file {file}: {e}")
        search_cache_index.clear()
        logger.info("All search cache entries have been cleared.")
        return

    # searxng_search_results swaps the default count of 10 for the configured
    # result count before hashing, so the entry may be stored under either
    # number. dict.fromkeys drops the duplicate when both are 10.
    candidate_counts = dict.fromkeys((10, _get_result_count()))

    found = False
    for count in candidate_counts:
        cache_key = generate_cache_key(f"{query}|{count}|None")
        cache_file = SEARCH_CACHE_DIR / f"{cache_key}.cache"
        if not cache_file.exists():
            continue
        found = True
        try:
            cache_file.unlink(missing_ok=True)
            search_cache_index.pop(cache_key, None)
            logger.info(f"Cache entry for query '{query}' has been invalidated.")
        except Exception as e:
            error_logger.warning(f"Failed to delete cache file for query '{query}': {e}")

    if not found:
        logger.info(f"No cache entry found for query '{query}'.")
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Comprehensive web search (with advanced filtering)
|
||||
# ----------------------------------------------------------------------
|
||||
def comprehensive_web_search(
    query: str,
    max_pages: int = 3,
    max_workers: int = 4,
    time_filter: Optional[str] = None,
    domain_whitelist: Optional[Set[str]] = None,
    domain_blacklist: Optional[Set[str]] = None,
    content_type: Optional[str] = None,
    language: Optional[str] = None,
    min_content_length: int = 0,
    return_sources: bool = False,
):
    """Search the web, fetch the best pages, and format one text report.

    Providers from ``_build_provider_chain`` are tried in order, two attempts
    each, until one returns results. The results are ranked, their URLs are
    filtered, up to ``max_pages`` of the surviving URLs are fetched in
    parallel, and everything is rendered into a single string.

    Args:
        query: The search query.
        max_pages: Maximum number of pages whose content is fetched.
        max_workers: Thread-pool size used for fetching.
        time_filter: Optional recency hint forwarded to the provider.
        domain_whitelist: If given, only URLs whose netloc is in this set pass.
        domain_blacklist: If given, URLs whose netloc is in this set are dropped.
        content_type: ``"article"``, ``"forum"`` or ``"academic"``; matched
            against keywords in the URL.
        language: Language code that must appear as ``/xx/``, ``?lang=xx`` or
            ``&lang=xx`` in the URL.
        min_content_length: Minimum length of fetched content to keep a page.
        return_sources: When true, return ``(text, sources)`` instead of text.

    Returns:
        The formatted report, or a short explanatory message when search is
        disabled, no provider returned results, or every URL was filtered
        out. With ``return_sources=True`` a ``(text, sources)`` tuple, where
        ``sources`` is a list of ``{"url", "title"}`` dicts (empty on the
        early-exit paths).
    """
    logger.info(f"Starting comprehensive search for: {query}")
    if time_filter:
        logger.info(f"Applying time filter: {time_filter}")

    settings = _get_search_settings()
    search_provider = settings.get("search_provider", "searxng")
    result_count = _get_result_count()

    if search_provider == "disabled":
        logger.info("Search is disabled via admin settings")
        msg = "Web search is disabled by the administrator."
        return (msg, []) if return_sources else msg

    # Use configured result count (at least max_pages for content fetching)
    fetch_count = max(result_count, max_pages)

    provider_chain = _build_provider_chain(search_provider)

    search_results = []
    provider_attempts = {}
    for provider_name in provider_chain:
        last_err = None
        empty = False
        # Each provider gets two attempts before the next one is tried.
        for attempt in range(2):
            try:
                search_results = _call_provider(provider_name, query, fetch_count, time_filter)
                if search_results:
                    provider_attempts[provider_name] = f"ok ({len(search_results)})"
                    logger.info(f"Comprehensive search: {provider_name} returned {len(search_results)} results")
                    break
                empty = True
            except Exception as e:
                last_err = e
                logger.warning(f"Comprehensive search: {provider_name} attempt {attempt + 1} failed: {e}")
        if search_results:
            break
        # Record why this provider produced nothing; an error wins over "empty".
        if last_err is not None:
            provider_attempts[provider_name] = f"error: {last_err}"
        elif empty:
            provider_attempts[provider_name] = "empty"

    if not search_results:
        tally = ", ".join(f"{p}:{r}" for p, r in provider_attempts.items()) or "no providers configured"
        any_errors = any(r.startswith("error") for r in provider_attempts.values())
        if any_errors:
            msg = f"Web search failed — all providers errored or returned empty. Tried: {tally}"
        else:
            msg = (
                f"No search results found. Tried: {tally}. "
                "All providers returned empty — possibly a niche query or upstream rate-limiting; "
                "rephrasing or using the browser tool for a specific URL may help."
            )
        logger.warning(msg)
        return (msg, []) if return_sources else msg

    search_results = rank_search_results(query, search_results)

    # URL filter helper
    def url_passes_filters(url: str) -> bool:
        try:
            netloc = urlparse(url).netloc.lower()
        except Exception:
            return False
        if domain_whitelist is not None and netloc not in domain_whitelist:
            return False
        if domain_blacklist is not None and netloc in domain_blacklist:
            return False
        url_lc = url.lower()
        if content_type:
            ct = content_type.lower()
            if ct == "article":
                if not any(k in url_lc for k in ("article", "blog", "news", "post")):
                    return False
            elif ct == "forum":
                if not any(k in url_lc for k in ("forum", "discussion", "thread", "topic")):
                    return False
            elif ct == "academic":
                if not any(k in url_lc for k in ("pdf", "doi", "scholar", "arxiv", "journal", "research")):
                    return False
        if language:
            lang_pat = language.lower()
            if not (f"/{lang_pat}/" in url_lc or f"?lang={lang_pat}" in url_lc or f"&lang={lang_pat}" in url_lc):
                return False
        return True

    # Filter first and cap afterwards: capping first meant a filtered-out top
    # hit used up a fetch slot, and could leave nothing to fetch even though
    # lower-ranked results satisfied the filters.
    filtered_urls = [
        url
        for url in (r.get("url", "") for r in search_results)
        if url and url_passes_filters(url)
    ][:max_pages]
    if not filtered_urls:
        logger.warning("All URLs filtered out by advanced criteria")
        msg = "No suitable results after applying filters."
        return (msg, []) if return_sources else msg

    # Build sources list for the frontend (before content fetching)
    _source_list = [
        {"url": r.get("url", ""), "title": r.get("title", "")}
        for r in search_results if r.get("url")
    ]

    # Fetch content in parallel
    fetched_content = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(fetch_webpage_content, url, 8, retry_attempt=0): url
            for url in filtered_urls
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                if result["success"] and result["content"] and len(result["content"]) >= min_content_length:
                    fetched_content.append(result)
            except Exception as e:
                logger.error(f"Exception while fetching {url}: {str(e)}")

    logger.info(f"Successfully fetched content from {len(fetched_content)} pages")

    # Format results
    output_parts = []

    if search_results:
        output_parts.append("```sources")
        for i, result in enumerate(search_results, 1):
            output_parts.append(f"[{i}] {result['title']}")
            output_parts.append(f"    {result['url']}")
            if result.get("age"):
                output_parts.append(f"    {result['age']}")
        output_parts.append("```")
        output_parts.append("")

    output_parts.append("=" * 70)
    output_parts.append("WEB SEARCH RESULTS AND FETCHED CONTENT")
    output_parts.append(f"Query: {query}")
    output_parts.append(f"Searched {len(search_results)} results, fetched {len(fetched_content)} pages")
    output_parts.append("=" * 70)
    output_parts.append("")

    output_parts.append("SEARCH RESULTS SUMMARY:")
    output_parts.append("-" * 50)
    for i, result in enumerate(search_results, 1):
        output_parts.append(f"\n[{i}] {result['title']}")
        output_parts.append(f"    URL: {result['url']}")
        output_parts.append(f"    Snippet: {result['snippet'][:200]}...")
        if result.get("age"):
            output_parts.append(f"    Age: {result['age']}")

    if fetched_content:
        output_parts.append("\n" + "=" * 70)
        output_parts.append("FETCHED PAGE CONTENT:")
        output_parts.append("-" * 50)

        for i, content in enumerate(fetched_content, 1):
            output_parts.append(f"\n[CONTENT {i}] From: {content['url']}")
            output_parts.append(f"Title: {content['title']}")
            output_parts.append("-" * 30)

            # Only the first 3000 characters of each page go into the report.
            text = content["content"][:3000]
            if len(content["content"]) > 3000:
                text += "... [truncated]"
            output_parts.append(text)

            key_points = extract_key_points(content["content"])
            if key_points:
                output_parts.append("\nKey Points:")
                for pt in key_points[:5]:
                    output_parts.append(f"- {pt}")

            tldr = get_tldr(content["content"])
            if tldr:
                output_parts.append("\nTL;DR:")
                output_parts.append(tldr)

            quotes = extract_quotes(content["content"])
            if quotes:
                output_parts.append("\nImportant Quotes:")
                for q in quotes[:3]:
                    output_parts.append(f"\u201c{q}\u201d")

            stats = extract_statistics(content["content"])
            if stats:
                output_parts.append("\nData / Statistics:")
                for s in stats[:5]:
                    output_parts.append(f"- {s}")

            output_parts.append("")

    output_parts.append("=" * 70)
    output_parts.append("END OF WEB SEARCH RESULTS")
    output_parts.append("=" * 70)

    instructions = (
        "\n\nIMPORTANT INSTRUCTIONS:\n"
        "1. Use the above web search results and fetched content to answer the user's question\n"
        "2. Prioritize information from the FETCHED PAGE CONTENT section as it contains actual page data\n"
        "3. Cross-reference multiple sources when possible\n"
        "4. If the information is time-sensitive, pay attention to the age of the results\n"
        "5. Be explicit if the search results don't contain sufficient information to fully answer the question"
    )
    output_parts.append(instructions)

    result = "\n".join(output_parts)
    return (result, _source_list) if return_sources else result
|
||||
@@ -0,0 +1,527 @@
|
||||
"""Search provider implementations: SearXNG, Brave, DuckDuckGo, Google PSE, Tavily, Serper."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.constants import SEARXNG_INSTANCE
|
||||
from .analytics import RateLimitError, error_logger
|
||||
from .query import build_enhanced_query
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Value passed as ``timeout=`` to the Brave, DuckDuckGo, Google PSE, Tavily
# and Serper HTTP calls in this module (the SearXNG calls use their own values).
REQUEST_TIMEOUT = 20

# Provider registry — maps setting value to (label, needs_key, needs_url)
PROVIDER_INFO = {
    "searxng": ("SearXNG", False, True),
    "brave": ("Brave Search", True, False),
    "duckduckgo": ("DuckDuckGo", False, False),
    "google_pse": ("Google PSE", True, False),
    "tavily": ("Tavily", True, False),
    "serper": ("Serper", True, False),
    "disabled": ("Disabled", False, False),
}
|
||||
|
||||
|
||||
# ── Settings helpers ──
|
||||
|
||||
def _get_search_settings() -> dict:
    """Return the admin search settings as a dict, or ``{}`` when unavailable.

    The settings module is imported inside the function, so an import failure
    is handled the same way as a failure while loading.

    Returns:
        The dict produced by ``src.settings.load_settings()``; an empty dict
        if importing or loading raises, or if the loader returns a non-dict.
    """
    try:
        from src.settings import load_settings

        settings = load_settings()
    except Exception:
        # Deliberate best-effort: search must keep working (on env/default
        # values chosen by the callers) even when settings cannot be read.
        return {}
    # Every caller uses ``.get`` on the result without checking its type.
    return settings if isinstance(settings, dict) else {}
|
||||
|
||||
|
||||
def _get_search_instance() -> str:
    """Return the base URL of the search instance, without a trailing slash.

    The admin ``search_url`` setting wins when it is non-blank; otherwise the
    ``SEARXNG_INSTANCE`` constant is used. Both are normalised the same way,
    because callers build ``f"{instance}/search"`` and a trailing slash would
    produce ``//search``.
    """
    settings = _get_search_settings()
    configured = (settings.get("search_url") or "").strip()
    base = configured or (SEARXNG_INSTANCE or "").strip()
    return base.rstrip("/")
|
||||
|
||||
|
||||
def _get_provider_key(provider: str) -> str:
    """Look up the API key configured for *provider*.

    The provider's dedicated settings field is preferred. When that field is
    blank, or the provider has no dedicated field, the legacy shared
    ``search_api_key`` setting is returned instead (possibly an empty string).
    """
    settings = _get_search_settings()
    setting_names = {
        "brave": "brave_api_key",
        "google_pse": "google_pse_key",
        "tavily": "tavily_api_key",
        "serper": "serper_api_key",
    }
    name = setting_names.get(provider, "")
    dedicated = (settings.get(name) or "").strip() if name else ""
    if dedicated:
        return dedicated
    # Legacy fallback: old shared search_api_key field
    legacy = settings.get("search_api_key") or ""
    return legacy.strip()
|
||||
|
||||
|
||||
def _get_result_count() -> int:
    """Return the configured number of search results, defaulting to 5.

    The default is used when ``search_result_count`` is missing, cannot be
    converted to an int, or is not positive. A zero or negative count is
    meaningless as a result limit and would be passed straight into list
    slices and provider request parameters.
    """
    default = 5
    settings = _get_search_settings()
    try:
        count = int(settings.get("search_result_count", default))
    except (ValueError, TypeError):
        return default
    return count if count > 0 else default
|
||||
|
||||
|
||||
# ── SearXNG ──
|
||||
|
||||
# Substrings that mark a query as news-like; searxng_search_api looks for them
# in the lower-cased query text.
_NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag")

# Default general engines (google/duckduckgo/brave/startpage/wikipedia) are
# routinely rate-limited / CAPTCHA-blocked on this instance and return nothing.
# Pin engines that actually respond so non-news queries get results without any
# third-party API fallback. Override via SEARXNG_GENERAL_ENGINES.
# The value is read once at import time and sent as SearXNG's ``engines``
# request parameter; an empty string disables the pinning.
_GENERAL_ENGINES = os.environ.get("SEARXNG_GENERAL_ENGINES", "bing,mojeek,presearch")
|
||||
|
||||
|
||||
def searxng_search_api(query: str, count: int = 10, categories: str = "general",
                       time_filter: Optional[str] = None) -> List[dict]:
    """Search using the SearXNG JSON API, relaxing the request until it yields results.

    Request order (each step runs only if the previous one returned nothing):
      1. the initial request (``news`` category for news-like queries,
         otherwise *categories*, with pinned engines for ``general``);
      2. for news-like queries, the ``general`` category with pinned engines;
      3. the same request without the ``language`` pin;
      4. the same request without the ``engines`` pin.

    If any request raises, the HTML endpoint (``searxng_search``) is used
    instead.

    Args:
        query: The search query.
        count: Maximum number of results to return.
        categories: SearXNG category to search.
        time_filter: Optional recency hint; any non-``None`` value also marks
            the query as news-like.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts, possibly empty.
    """
    instance = _get_search_instance()
    headers = {"User-Agent": "Mozilla/5.0"}
    # News/fresh queries do badly in the 'general' category — it favours
    # encyclopedic/tourism pages, ignores recency, and (with no language pin)
    # bleeds in foreign-language results. When the agent layer detected
    # freshness (time_filter) or the query reads like a news lookup, switch to
    # the 'news' category, constrain recency, and pin language to English so a
    # search like "Canada latest news" returns actual news instead of Wikipedia.
    # Pin English for ALL searches — without it, SearXNG geolocates / mixes
    # languages and brand-ambiguous terms bleed in foreign SEO pages (e.g.
    # "Odyssey" → Honda Japan, "Trojan" → Japanese malware blogs, "Polyphemus"
    # → Chinese math forums). The news path already did this; general didn't.
    params = {"q": query, "format": "json", "language": "en"}
    q_lc = query.lower()
    is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS)
    if is_news and categories == "general":
        params["categories"] = "news"
        if time_filter in ("day", "week", "month", "year"):
            # 'day' is too sparse on most SearXNG news engines — widen to a week
            # so there's enough volume; the news category already biases recent.
            params["time_range"] = "week" if time_filter in ("day", "week") else time_filter
    else:
        params["categories"] = categories
        # Route general queries to engines that aren't blocked (default general
        # set returns 0 on this instance — see _GENERAL_ENGINES).
        if categories == "general" and _GENERAL_ENGINES:
            params["engines"] = _GENERAL_ENGINES

    def _parse_results(results):
        """Map raw SearXNG results to result dicts, dropping entries without a URL."""
        return [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "snippet": r.get("content", ""),
            }
            for r in results[:count]
            if r.get("url")
        ]

    def _run(search_params):
        """Issue one request; return (parsed results, raw JSON payload)."""
        response = httpx.get(
            f"{instance}/search",
            params=search_params,
            headers=headers,
            timeout=15,
        )
        response.raise_for_status()
        data = response.json()
        return _parse_results(data.get("results", [])), data

    try:
        active_params = params
        parsed, data = _run(active_params)
        if not parsed and is_news and categories == "general":
            # Some self-hosted SearXNG configs have no working news engines.
            # Fall back to the known-good general engines before reporting an
            # empty search, otherwise common queries like "Canada news" fail.
            fallback = {
                "q": query,
                "format": "json",
                "language": "en",
                "categories": "general",
            }
            if _GENERAL_ENGINES:
                fallback["engines"] = _GENERAL_ENGINES
            logger.info(
                "SearXNG news search returned 0 results for %r; retrying general engines",
                query,
            )
            active_params = fallback
            parsed, data = _run(active_params)
        if not parsed and active_params.get("language"):
            fallback = dict(active_params)
            fallback.pop("language", None)
            logger.info(
                "SearXNG language-pinned search returned 0 results for %r; retrying without language",
                query,
            )
            active_params = fallback
            parsed, data = _run(active_params)
        if not parsed and active_params.get("engines"):
            fallback = dict(active_params)
            fallback.pop("engines", None)
            logger.info(
                "SearXNG pinned engines returned 0 results for %r; retrying default engines",
                query,
            )
            parsed, data = _run(fallback)
        logger.info(f"SearXNG JSON API returned {len(parsed)} results for: {query}")
        if not parsed:
            # Surface which upstream engines did not answer, to aid debugging.
            unresponsive = data.get("unresponsive_engines") if isinstance(data, dict) else None
            if unresponsive:
                logger.info(f"SearXNG unresponsive engines for {query!r}: {unresponsive}")
        return parsed
    except Exception as e:
        logger.warning(f"SearXNG JSON API search failed: {e}")
        html_results = searxng_search(query, max_results=count)
        if html_results:
            logger.info(f"SearXNG HTML fallback returned {len(html_results)} results for: {query}")
        # Honour the declared list return type even if the fallback yields nothing.
        return html_results or []
|
||||
|
||||
|
||||
def searxng_search(query, max_results=10):
    """Search a SearXNG instance by scraping its HTML results page.

    Args:
        query: The search query.
        max_results: Maximum number of ``article.result`` elements to read.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts. Always a list: it is
        empty when the request fails, the response status is not a success,
        or nothing could be parsed.
    """
    instance = _get_search_instance()
    req_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = httpx.get(
            f"{instance}/search",
            params={"q": query},
            headers=req_headers,
            timeout=10,
        )
        if not response.is_success:
            logger.warning(f"SearXNG search (HTML) returned HTTP {response.status_code}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")
        results = []
        for article in soup.select("article.result")[:max_results]:
            title_elem = article.select_one("h3 a")
            if not title_elem:
                continue
            url = title_elem.get("href", "")
            if not url:
                # Same rule as the other providers: a result without a URL is unusable.
                continue
            snippet_elem = article.select_one("p.content")
            results.append({
                "title": title_elem.get_text(strip=True),
                "url": url,
                "snippet": snippet_elem.get_text(strip=True) if snippet_elem else "",
            })
        logger.info(f"SearXNG search (HTML) returned {len(results)} results")
        return results
    except Exception as e:
        logger.error(f"SearXNG search failed: {e}")
        return []
|
||||
|
||||
|
||||
# ── Brave ──
|
||||
|
||||
def brave_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Run a Brave search with the key from ``_get_provider_key("brave")`` or the environment.

    The ``DATA_BRAVE_API_KEY`` environment variable is consulted only when no
    key comes back from the settings lookup.
    """
    key = _get_provider_key("brave")
    if not key:
        key = os.environ.get("DATA_BRAVE_API_KEY") or ""
    return _brave_search_impl(query, count, time_filter, search_config={"brave_api_key": key})
|
||||
|
||||
|
||||
def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None, search_config: Optional[dict] = None) -> List[dict]:
    """Core Brave web-search API call.

    Args:
        query: Raw user query; it is passed through ``build_enhanced_query``.
        count: Maximum number of results to return.
        time_filter: ``"day"``, ``"week"``, ``"month"`` or ``"year"``; other
            values add no freshness restriction.
        search_config: Optional dict whose ``brave_api_key`` entry is the API
            key; the ``DATA_BRAVE_API_KEY`` env var is the fallback.

    Returns:
        A list of ``{"title", "url", "snippet", "age"}`` dicts. Empty when no
        key is configured, on a network error, on HTTP 429, or when the body
        is not valid JSON.

    Raises:
        httpx.HTTPStatusError: For other error statuses; ``raise_for_status``
            raises it and it is not caught here.
    """
    enhanced_query = build_enhanced_query(query, time_filter)
    config = search_config or {}

    brave_api_key = config.get("brave_api_key") or os.environ.get("DATA_BRAVE_API_KEY")
    if not brave_api_key:
        logger.warning("Brave API key not found, returning empty results for fallback")
        return []

    headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"}
    # Brave accepts at most 20 results per request.
    params = {"q": enhanced_query, "count": min(count, 20)}
    # Brave's ``freshness`` parameter takes pd/pw/pm/py, not day/week/month/year.
    freshness_codes = {"day": "pd", "week": "pw", "month": "pm", "year": "py"}
    if time_filter and time_filter in freshness_codes:
        params["freshness"] = freshness_codes[time_filter]

    logger.info(f"Executing Brave search with query: {enhanced_query}")
    try:
        response = httpx.get(
            "https://api.search.brave.com/res/v1/web/search",
            headers=headers,
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 429:
            raise RateLimitError("Brave rate limit hit")
        response.raise_for_status()
    except httpx.RequestError as e:
        error_logger.error(f"NetworkError during Brave search: {e}")
        return []
    except RateLimitError as e:
        error_logger.error(str(e))
        return []

    try:
        data = response.json()
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse Brave API response: {e}")
        return []

    results = []
    if "web" in data and "results" in data["web"]:
        for item in data["web"]["results"][:count]:
            url = item.get("url", "")
            if not url:
                continue
            results.append({
                "title": item.get("title", ""),
                "url": url,
                "snippet": item.get("description", "") or item.get("content", ""),
                "age": item.get("date") or "",
            })

    logger.info(f"Brave search returned {len(results)} results")
    return results
|
||||
|
||||
|
||||
# ── DuckDuckGo (free, no key) ──
|
||||
|
||||
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Search DuckDuckGo without an API key.

    The ``duckduckgo-search`` library is tried first. The HTML endpoint is
    scraped instead when the library is not installed, raises, or returns no
    results.

    Args:
        query: The search query.
        count: Maximum number of results to return.
        time_filter: ``"day"``, ``"week"``, ``"month"`` or ``"year"``; applied
            only on the library path.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts, possibly empty.
    """
    from urllib.parse import parse_qs, urlparse

    def _unwrap_redirect(href: str) -> str:
        """Return the target of a ``duckduckgo.com/l/?uddg=...`` link, else *href* unchanged."""
        parts = urlparse(href)
        if parts.netloc.endswith("duckduckgo.com") and parts.path.startswith("/l/"):
            targets = parse_qs(parts.query).get("uddg")
            if targets:
                return targets[0]
        return href

    def _html_fallback() -> List[dict]:
        """Scrape html.duckduckgo.com; return ``[]`` on any failure."""
        try:
            response = httpx.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query},
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            parsed = []
            for result in soup.select(".result")[:count]:
                link = result.select_one(".result__a")
                if not link:
                    continue
                # Result links on the HTML endpoint are scheme-less redirect
                # URLs; store the real destination so the page can be fetched.
                url = _unwrap_redirect(link.get("href", ""))
                if not url:
                    continue
                snippet_el = result.select_one(".result__snippet")
                parsed.append({
                    "title": link.get_text(" ", strip=True),
                    "url": url,
                    "snippet": snippet_el.get_text(" ", strip=True) if snippet_el else "",
                })
            logger.info(f"DuckDuckGo HTML search returned {len(parsed)} results")
            return parsed
        except Exception as e:
            logger.warning(f"DuckDuckGo HTML search failed: {e}")
            return []

    try:
        from duckduckgo_search import DDGS
    except ImportError:
        logger.warning("duckduckgo-search package not installed; using HTML fallback")
        return _html_fallback()

    timelimit = None
    if time_filter:
        time_map = {"day": "d", "week": "w", "month": "m", "year": "y"}
        timelimit = time_map.get(time_filter)

    try:
        ddgs = DDGS()
        raw = ddgs.text(query, max_results=count, timelimit=timelimit)
        results = []
        for item in raw:
            url = item.get("href", "")
            if not url:
                continue
            results.append({
                "title": item.get("title", ""),
                "url": url,
                "snippet": item.get("body", ""),
            })
        logger.info(f"DuckDuckGo search returned {len(results)} results")
        return results or _html_fallback()
    except Exception as e:
        logger.warning(f"DuckDuckGo search failed: {e}")
        return _html_fallback()
|
||||
|
||||
|
||||
# ── Google Programmable Search Engine ──
|
||||
|
||||
def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Search using Google PSE (Custom Search JSON API).

    Credentials:
      - API key: ``_get_provider_key("google_pse")`` (the ``google_pse_key``
        setting, then the legacy ``search_api_key``), else the
        ``GOOGLE_API_KEY`` env var.
      - Engine ID (cx): the ``google_pse_cx`` setting, else the
        ``GOOGLE_PSE_CX`` env var.

    Args:
        query: The search query.
        count: Maximum number of results; the request asks for at most 10.
        time_filter: ``"day"``, ``"week"``, ``"month"`` or ``"year"``, sent
            as ``dateRestrict``.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts. Empty when
        credentials are missing, on a network error, on HTTP 429, or when the
        body is not valid JSON.

    Raises:
        httpx.HTTPStatusError: For other error statuses; ``raise_for_status``
            raises it and it is not caught here.
    """
    settings = _get_search_settings()
    api_key = _get_provider_key("google_pse") or os.environ.get("GOOGLE_API_KEY", "")
    cx = (settings.get("google_pse_cx") or "").strip() or os.environ.get("GOOGLE_PSE_CX", "")

    if not api_key or not cx:
        logger.warning("Google PSE: missing API key or CX ID")
        return []

    params = {
        "key": api_key,
        "cx": cx,
        "q": query,
        # Google PSE accepts 1..10 results per request.
        "num": max(1, min(count, 10)),
    }
    if time_filter:
        # dateRestrict: d[number], w[number], m[number], y[number]
        time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"}
        if time_filter in time_map:
            params["dateRestrict"] = time_map[time_filter]

    try:
        response = httpx.get(
            "https://www.googleapis.com/customsearch/v1",
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 429:
            raise RateLimitError("Google PSE rate limit hit")
        response.raise_for_status()
        data = response.json()
    except httpx.RequestError as e:
        error_logger.error(f"Google PSE search failed: {e}")
        return []
    except RateLimitError as e:
        error_logger.error(str(e))
        return []
    except json.JSONDecodeError as e:
        # Same handling as the Brave path: an unparsable body yields no results.
        logger.error(f"Failed to parse Google PSE response: {e}")
        return []

    results = []
    for item in data.get("items", [])[:count]:
        url = item.get("link", "")
        if not url:
            continue
        results.append({
            "title": item.get("title", ""),
            "url": url,
            "snippet": item.get("snippet", ""),
        })

    logger.info(f"Google PSE returned {len(results)} results")
    return results
|
||||
|
||||
|
||||
# ── Tavily ──
|
||||
|
||||
def tavily_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Query the Tavily search API.

    The key comes from ``_get_provider_key("tavily")`` or, failing that, the
    ``TAVILY_API_KEY`` env var. Returns ``{"title", "url", "snippet", "age"}``
    dicts, or ``[]`` when no key is configured, on a network error, or on
    HTTP 429.
    """
    api_key = _get_provider_key("tavily") or os.environ.get("TAVILY_API_KEY", "")
    if not api_key:
        logger.warning("Tavily: no API key configured")
        return []

    payload = {
        "query": query,
        "max_results": count,
        "include_answer": False,
    }
    # A recognised time filter becomes a look-back window in days.
    days_back = {"day": 1, "week": 7, "month": 30, "year": 365}
    if time_filter and time_filter in days_back:
        payload["days"] = days_back[time_filter]

    request_headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    try:
        response = httpx.post(
            "https://api.tavily.com/search",
            json=payload,
            headers=request_headers,
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 429:
            raise RateLimitError("Tavily rate limit hit")
        response.raise_for_status()
        data = response.json()
    except httpx.RequestError as e:
        error_logger.error(f"Tavily search failed: {e}")
        return []
    except RateLimitError as e:
        error_logger.error(str(e))
        return []

    # Entries without a URL are dropped.
    results = [
        {
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("content", ""),
            "age": item.get("published_date", ""),
        }
        for item in data.get("results", [])[:count]
        if item.get("url", "")
    ]

    logger.info(f"Tavily returned {len(results)} results")
    return results
|
||||
|
||||
|
||||
# ── Serper.dev ──
|
||||
|
||||
def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Query the Serper.dev search API.

    The key comes from ``_get_provider_key("serper")`` or, failing that, the
    ``SERPER_API_KEY`` env var. Returns ``{"title", "url", "snippet", "age"}``
    dicts built from the ``organic`` section of the response, or ``[]`` when
    no key is configured, on a network error, or on HTTP 429.
    """
    api_key = _get_provider_key("serper") or os.environ.get("SERPER_API_KEY", "")
    if not api_key:
        logger.warning("Serper: no API key configured")
        return []

    payload = {"q": query, "num": count}
    # A recognised time filter is sent as the ``tbs`` request field.
    recency = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"}
    if time_filter and time_filter in recency:
        payload["tbs"] = recency[time_filter]

    request_headers = {"X-API-KEY": api_key, "Content-Type": "application/json"}
    try:
        response = httpx.post(
            "https://google.serper.dev/search",
            json=payload,
            headers=request_headers,
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 429:
            raise RateLimitError("Serper rate limit hit")
        response.raise_for_status()
        data = response.json()
    except httpx.RequestError as e:
        error_logger.error(f"Serper search failed: {e}")
        return []
    except RateLimitError as e:
        error_logger.error(str(e))
        return []

    # Entries without a link are dropped.
    results = [
        {
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
            "age": item.get("date", ""),
        }
        for item in data.get("organic", [])[:count]
        if item.get("link", "")
    ]

    logger.info(f"Serper returned {len(results)} results")
    return results
|
||||
@@ -0,0 +1,128 @@
|
||||
"""Query enhancement, entity extraction, and cache duration helpers."""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from datetime import timedelta
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Query processing helpers
|
||||
# ----------------------------------------------------------------------
|
||||
def _detect_question_type(query: str) -> Optional[str]:
    """Return the leading question word (who, what, when, where, why, how), if any.

    The word must be a whole word at the start of the query (leading
    whitespace and letter case are ignored). Queries that merely begin with
    the same letters, such as "Howard Stern" or "whatever", are not
    classified as questions.

    Returns:
        The lower-cased question word, or ``None`` when the query does not
        start with one.
    """
    match = re.match(r"\s*(who|what|when|where|why|how)\b", query, flags=re.I)
    return match.group(1).lower() if match else None
|
||||
|
||||
|
||||
def _extract_entities(query: str) -> Dict[str, List[str]]:
    """Lightweight entity extraction: capitalized words and date patterns.

    A leading question word is removed first so that, for example, "Who" is
    not reported as a name.

    Returns:
        ``{"names": [...], "dates": [...]}``. ``names`` holds capitalized
        words. ``dates`` holds four-digit years (19xx/20xx) followed by
        "Month D, YYYY" style dates.
    """
    entities: Dict[str, List[str]] = {"names": [], "dates": []}
    # Strip surrounding whitespace first so the ``^`` anchor below can match.
    cleaned = query.strip()
    qtype = _detect_question_type(cleaned)
    if qtype:
        cleaned = re.sub(rf"^{qtype}\b", "", cleaned, flags=re.I).strip()
    entities["names"].extend(re.findall(r"\b[A-Z][a-zA-Z]+\b", cleaned))
    # The group must be non-capturing: with a capturing group ``findall``
    # returns only the group ("19"/"20") instead of the whole year.
    entities["dates"].extend(re.findall(r"\b(?:19|20)\d{2}\b", cleaned))
    month_day_year = re.findall(
        r"\b(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{1,2},?\s*\d{4}\b",
        cleaned,
        flags=re.I,
    )
    entities["dates"].extend(month_day_year)
    return entities
|
||||
|
||||
|
||||
def _split_multi_part(query: str) -> List[str]:
    """Break *query* into sub-queries at " and ", " or " (any case) and ";".

    Blank pieces are discarded and every returned piece is stripped.
    """
    pieces: List[str] = []
    for chunk in re.split(r"\s+and\s+|\s+or\s+|;", query, flags=re.I):
        chunk = chunk.strip()
        if chunk:
            pieces.append(chunk)
    return pieces
|
||||
|
||||
|
||||
def _extract_site_filter(query: str) -> Tuple[str, Optional[str]]:
    """Pull a ``site:example.com`` token out of *query*.

    Returns:
        ``(query_without_site_tokens, site)`` where ``site`` is the value of
        the first token found. The query is returned untouched with ``None``
        when no token is present.
    """
    found = re.search(r"\bsite:([^\s]+)", query, flags=re.I)
    if found is None:
        return query, None
    # Every site token is removed, not just the first one.
    remainder = re.sub(r"\bsite:[^\s]+", "", query, flags=re.I).strip()
    return remainder, found.group(1)
|
||||
|
||||
|
||||
def _boost_entities_in_query(base_query: str, entities: Dict[str, List[str]]) -> str:
    """Append quoted entities to *base_query*.

    Names come first, then dates. Within each group the quoted values are
    joined with ``OR``; the groups are joined to the base query with spaces.
    """
    clauses = [base_query]
    for kind in ("names", "dates"):
        values = entities.get(kind)
        if values:
            clauses.append(" OR ".join(f'"{value}"' for value in values))
    return " ".join(clauses)
|
||||
|
||||
|
||||
def enhance_query(original_query: str) -> Tuple[str, Optional[str]]:
    """Rewrite a raw query into its boosted form.

    Steps: remove any ``site:`` token, split the rest into sub-queries, append
    quoted entities to each sub-query, add a question-type keyword where one
    applies, AND the sub-queries together, and re-attach the site filter.

    Returns:
        ``(final_query, site)`` where ``site`` is the extracted site filter
        or ``None``.
    """
    # Question word -> keyword OR-ed onto the sub-query ("what" has none).
    boosts = {
        "who": "person",
        "when": "date",
        "where": "location",
        "why": "reason",
        "how": "method",
    }
    query_without_site, site = _extract_site_filter(original_query)

    enhanced_subs: List[str] = []
    for sub in _split_multi_part(query_without_site):
        keyword = boosts.get(_detect_question_type(sub))
        boosted = _boost_entities_in_query(sub, _extract_entities(sub))
        if keyword is not None:
            boosted = f'({boosted}) OR ({keyword})'
        enhanced_subs.append(boosted)

    final_query = " AND ".join(f"({s})" for s in enhanced_subs)
    if site:
        final_query = f"{final_query} site:{site}"
    return final_query, site
|
||||
|
||||
|
||||
def build_enhanced_query(query: str, time_filter: str = None) -> str:
    """Return the enhanced form of *query*, optionally with a time suffix.

    Args:
        query: Raw user query; it is passed through ``enhance_query``.
        time_filter: One of ``"day"``, ``"week"``, ``"month"`` or ``"year"``.
            A recognised value appends ``after:<d|w|m|y>`` to the query;
            a falsy or unrecognised value leaves the query untouched.

    Returns:
        The enhanced query string. The site value returned by
        ``enhance_query`` is discarded.
    """
    result, _site = enhance_query(query)

    time_codes = {"day": "d", "week": "w", "month": "m", "year": "y"}
    code = time_codes.get(time_filter) if time_filter else None
    if code is not None:
        result = f"{result} after:{code}"
        logger.info(f"Added time filter '{time_filter}' to query")

    logger.info(f"Enhanced query: '{query}' -> '{result}'")
    return result
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Cache duration helpers
|
||||
# ----------------------------------------------------------------------
|
||||
def _is_news_query(query: str) -> bool:
|
||||
"""Lightweight heuristic to decide if a query is news-oriented."""
|
||||
news_terms = {"news", "latest", "breaking", "today", "today's", "current", "updates", "happening"}
|
||||
tokens = set(re.findall(r"\b\w+\b", query.lower()))
|
||||
return bool(tokens & news_terms)
|
||||
|
||||
|
||||
def _cache_duration_for_query(query: str) -> timedelta:
    """Pick a cache lifetime for *query*.

    Queries that ``_is_news_query`` classifies as news are kept for
    30 minutes; everything else is kept for 24 hours.
    """
    return timedelta(minutes=30) if _is_news_query(query) else timedelta(hours=24)
|
||||
@@ -0,0 +1,127 @@
|
||||
"""Search result ranking based on relevance, source quality, and recency."""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
|
||||
_SPORTS_HINTS = {
|
||||
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
|
||||
"fifa", "world cup", "championship", "quarterfinal", "eliminates",
|
||||
}
|
||||
_LOW_VALUE_NEWS_DOMAINS = {
|
||||
"facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
|
||||
"www.yahoo.com", "msn.com", "www.msn.com",
|
||||
}
|
||||
_TRUSTED_NEWS_DOMAINS = {
|
||||
"apnews.com", "www.apnews.com", "reuters.com", "www.reuters.com",
|
||||
"bbc.com", "www.bbc.com", "cbc.ca", "www.cbc.ca",
|
||||
"ctvnews.ca", "www.ctvnews.ca", "globalnews.ca", "www.globalnews.ca",
|
||||
"theguardian.com",
|
||||
"www.theguardian.com", "euronews.com", "www.euronews.com",
|
||||
"dw.com", "www.dw.com", "government.se", "www.government.se",
|
||||
}
|
||||
|
||||
|
||||
def _domain(url: str) -> str:
|
||||
try:
|
||||
return urlparse(url).netloc.lower()
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def rank_search_results(query: str, results: List[dict]) -> List[dict]:
    """Order *results* from most to least relevant to *query*.

    Every result is scored as::

        2.0 * title + 1.0 * snippet + 1.5 * domain + 1.0 * recency + news adjustment

    where the first four components lie in ``[0, 1]`` and the news
    adjustment (applied only when the query contains a news hint word) can
    be positive or negative.

    Sports hints are matched as whole words in the query and in the
    title/snippet text, so words that merely contain a hint ("inflation"
    contains "nfl", "transport" contains "sport") are not treated as sports
    content. Host names are still matched by substring because domain
    labels routinely run words together.

    Args:
        query: The user's search query.
        results: Result dicts. The keys read are ``title``, ``snippet``,
            ``url`` and ``age``; missing or ``None`` values are treated as
            empty.

    Returns:
        A new list holding the same dict objects, highest score first.
        Results with equal scores keep their input order.
    """
    query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
    query_lc = query.lower()
    is_news_query = any(term in _NEWS_HINTS for term in query_terms)

    # Compiled once per call instead of once per result.
    term_patterns = [re.compile(rf"\b{re.escape(term)}\b") for term in query_terms]
    sports_patterns = [re.compile(rf"\b{re.escape(hint)}\b") for hint in _SPORTS_HINTS]

    def mentions_sports(text: str) -> bool:
        """True when *text* (already lowercased) contains a sports hint as a whole word."""
        return any(pattern.search(text) for pattern in sports_patterns)

    is_sports_query = mentions_sports(query_lc)

    def title_score(title: str) -> float:
        """Fraction of query terms that occur in the title as whole words."""
        if not title or not term_patterns:
            return 0.0
        title_lc = title.lower()
        matches = sum(1 for pattern in term_patterns if pattern.search(title_lc))
        return matches / len(term_patterns)

    def snippet_score(snippet: str) -> float:
        """Mean of a length factor (capped at 200 chars) and the fraction of query terms found."""
        if not snippet:
            return 0.0
        snippet_lc = snippet.lower()
        length_factor = min(len(snippet), 200) / 200
        term_hits = sum(1 for term in query_terms if term in snippet_lc)
        term_factor = term_hits / len(query_terms) if query_terms else 0.0
        return (length_factor + term_factor) / 2

    def domain_score(url: str) -> float:
        """Authority score: trusted news / .edu / .gov = 1.0, .org = 0.7, other = 0.4."""
        netloc = _domain(url)
        if not netloc:
            return 0.0
        if netloc in _TRUSTED_NEWS_DOMAINS or netloc.endswith((".edu", ".gov")):
            return 1.0
        if netloc.endswith(".org"):
            return 0.7
        return 0.4

    def recency_score(age_str: Optional[str]) -> float:
        """1.0 up to 7 days old, falling linearly to 0.0 at 30 days; 0.0 if unparseable."""
        if not age_str:
            return 0.0
        for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
            try:
                dt = datetime.strptime(age_str, fmt)
            except (ValueError, TypeError):
                continue
            break
        else:
            # No supported format matched.
            return 0.0
        days_old = (datetime.now() - dt).days
        if days_old <= 7:
            return 1.0
        if days_old >= 30:
            return 0.0
        return (30 - days_old) / 23

    def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
        """Bonus/penalty applied only for news queries."""
        if not is_news_query:
            return 0.0
        text = f"{title} {snippet}".lower()
        netloc = _domain(url)
        adjustment = 0.0
        if netloc in _TRUSTED_NEWS_DOMAINS:
            adjustment += 1.2
        if any(term in text for term in ("latest news", "breaking news", "daily coverage", "news from")):
            adjustment += 0.4
        if netloc in _LOW_VALUE_NEWS_DOMAINS:
            adjustment -= 0.8
        # Off-topic sports coverage is pushed down unless the user asked for
        # sports. Whole-word match on text; substring match on the host.
        if not is_sports_query and (
            mentions_sports(text) or any(hint in netloc for hint in _SPORTS_HINTS)
        ):
            adjustment -= 1.5
        # A country/news query should not rank a page whose title/snippet barely
        # mentions the country above actual news pages for that country.
        subject_terms = [t for t in query_terms if t not in _NEWS_HINTS]
        if subject_terms and not any(t in text or t in netloc for t in subject_terms):
            adjustment -= 1.0
        return adjustment

    scored = []
    for result in results:
        # ``or ""`` also covers keys that are present but set to None, which
        # would otherwise be rendered as the literal text "None" below.
        title = result.get("title") or ""
        snippet = result.get("snippet") or ""
        url = result.get("url") or ""
        age = result.get("age")

        score = (
            2.0 * title_score(title)
            + 1.0 * snippet_score(snippet)
            + 1.5 * domain_score(url)
            + 1.0 * recency_score(age)
            + news_quality_adjustment(title, snippet, url)
        )
        scored.append((score, result))

    # list.sort is stable, so equal scores keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [result for _, result in scored]
|
||||
@@ -0,0 +1,95 @@
|
||||
# services/search/service.py
|
||||
"""Search service — clean interface for web search."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
from . import (
|
||||
comprehensive_web_search,
|
||||
fetch_webpage_content,
|
||||
get_search_config,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class SearchResult:
    """One hit returned by a web search."""

    # Address of the result page.
    url: str
    # Title reported for the result.
    title: str
    # Short excerpt reported for the result.
    snippet: str
    # Page content, when available; defaults to None.
    content: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class SearchResponse:
    """The outcome of one search call."""

    # The query string the search was run with.
    query: str
    # The hits, as SearchResult objects.
    results: List[SearchResult]
    # Count of results.
    total: int
    # Cache flag; defaults to False.
    cached: bool = False
|
||||
|
||||
|
||||
class SearchService:
    """
    Web search service.

    Thin object-oriented wrapper around the module-level search functions.

    Usage:
        service = SearchService()
        result = await service.search("python async patterns")
        for r in result.results:
            print(f"{r.title}: {r.url}")
    """

    def __init__(self, default_depth: int = 1, fetch_content: bool = True):
        """
        Args:
            default_depth: Depth used by ``search`` when none is given.
            fetch_content: Whether ``search`` fetches page content when the
                caller does not say otherwise.
        """
        self.default_depth = default_depth
        # Stored under its own name: an instance attribute called
        # ``fetch_content`` would shadow the ``fetch_content()`` method below
        # and make ``service.fetch_content(url)`` fail with
        # "'bool' object is not callable".
        self.default_fetch_content = fetch_content

    async def search(
        self,
        query: str,
        depth: Optional[int] = None,
        fetch_content: Optional[bool] = None,
    ) -> "SearchResponse":
        """
        Search the web.

        Args:
            query: Search query
            depth: Search depth (1=quick, 2=thorough, 3=comprehensive).
                ``None`` or ``0`` falls back to the instance default. Up to
                ``10 * depth`` results are requested.
            fetch_content: Whether to fetch full page content. ``None``
                falls back to the instance default.

        Returns:
            SearchResponse with results; ``total`` is the number of results
            and ``cached`` is left at its default.
        """
        depth = depth or self.default_depth
        if fetch_content is None:
            fetch_content = self.default_fetch_content

        # Use existing search implementation
        # NOTE(review): assumes this yields an iterable of dicts - confirm.
        raw_results = await comprehensive_web_search(
            query,
            max_results=10 * depth,
            fetch_content=fetch_content,
        )

        results = [
            SearchResult(
                url=r.get("url", ""),
                title=r.get("title", ""),
                snippet=r.get("snippet", ""),
                content=r.get("content"),
            )
            for r in raw_results
        ]

        return SearchResponse(
            query=query,
            results=results,
            total=len(results),
        )

    async def fetch_content(self, url: str) -> Optional[str]:
        """Fetch content from a URL."""
        return await fetch_webpage_content(url)

    def get_config(self) -> Dict[str, Any]:
        """Get current search configuration."""
        return get_search_config()
|
||||
Reference in New Issue
Block a user