Odysseus v1.0

This commit is contained in:
pewdiepie-archdaemon
2026-05-31 23:58:26 +09:00
commit e5c99a5eee
421 changed files with 271349 additions and 0 deletions
+35
View File
@@ -0,0 +1,35 @@
"""Search service — web search with SearXNG."""
from .core import (
comprehensive_web_search,
get_search_config,
invalidate_search_cache,
searxng_search_results,
update_search_config,
)
from .content import fetch_webpage_content
from .providers import searxng_search, searxng_search_api, PROVIDER_INFO
from .analytics import get_search_stats, SearchEngineError, NetworkError, ParseError, RateLimitError
from .service import SearchService, SearchResult, SearchResponse
# Names re-exported by the package; this list is what `import *` exposes.
__all__ = [
    # Service interface (preferred)
    "SearchService",
    "SearchResult",
    "SearchResponse",
    # Low-level functions (for backwards compat)
    "comprehensive_web_search",
    "fetch_webpage_content",
    "get_search_config",
    "get_search_stats",
    "invalidate_search_cache",
    "searxng_search",
    "searxng_search_api",
    "searxng_search_results",
    "update_search_config",
    # Provider registry
    "PROVIDER_INFO",
    # Exception hierarchy
    "SearchEngineError",
    "NetworkError",
    "ParseError",
    "RateLimitError",
]
+136
View File
@@ -0,0 +1,136 @@
"""Search analytics, metrics tracking, and exception hierarchy."""
import json
import logging
from collections import Counter
from pathlib import Path
from typing import Dict, Any
from .cache import cache_metrics
logger = logging.getLogger(__name__)

# Dedicated error logger with its own file handler. Records go only to the
# file (propagate is disabled), never to the root logger's handlers.
_error_log_path = Path(__file__).resolve().parent.parent / "search_engine_error.log"
# delay=True defers opening the log file until the first record is emitted, so
# merely importing this module neither creates an empty log file nor fails
# when the target directory is not writable.
_error_handler = logging.FileHandler(_error_log_path, encoding="utf-8", delay=True)
_error_handler.setLevel(logging.WARNING)
_error_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))
error_logger = logging.getLogger("search_engine_error")
error_logger.addHandler(_error_handler)
error_logger.propagate = False

# Analytics file (JSON document holding the persisted query counters)
ANALYTICS_FILE = Path(__file__).resolve().parent.parent / "search_analytics.json"
# ----------------------------------------------------------------------
# Custom exception hierarchy
# ----------------------------------------------------------------------
class SearchEngineError(Exception):
    """Base class for all search-engine related errors.

    Catch this type to handle every error defined by this hierarchy.
    """


class NetworkError(SearchEngineError):
    """Raised when a network request fails (e.g., timeout, DNS error)."""


class ParseError(SearchEngineError):
    """Raised when HTML or other content cannot be parsed."""


class RateLimitError(SearchEngineError):
    """Raised when the remote service returns a rate-limit (HTTP 429)."""
# ----------------------------------------------------------------------
# Analytics helpers
# ----------------------------------------------------------------------
def _load_analytics() -> Dict[str, Any]:
    """Load analytics data from the JSON file, creating defaults if missing.

    Returns:
        A dict that always contains the counters ``total_queries``,
        ``successful_queries``, ``failed_queries``, ``cache_hits``,
        ``cache_misses`` and the ``query_patterns`` mapping. If the file is
        missing it is created with zeroed defaults. If it cannot be read or
        does not hold a JSON object, zeroed defaults are returned without
        touching the file.
    """
    defaults: Dict[str, Any] = {
        "total_queries": 0,
        "successful_queries": 0,
        "failed_queries": 0,
        "cache_hits": 0,
        "cache_misses": 0,
        "query_patterns": {},
    }
    if not ANALYTICS_FILE.exists():
        _save_analytics(defaults)
        return defaults
    try:
        with open(ANALYTICS_FILE, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except Exception as e:
        logger.warning(f"Failed to load analytics file: {e}")
        return defaults
    if not isinstance(loaded, dict):
        logger.warning("Analytics file does not contain a JSON object; using defaults")
        return defaults
    # Backfill keys missing from older or hand-edited files so callers can
    # index the counters unconditionally instead of raising KeyError.
    for key, value in defaults.items():
        loaded.setdefault(key, value)
    if not isinstance(loaded["query_patterns"], dict):
        loaded["query_patterns"] = {}
    return loaded
def _save_analytics(data: Dict[str, Any]) -> None:
    """Write the analytics mapping to ANALYTICS_FILE as indented JSON.

    Best effort: any failure is logged as a warning and otherwise ignored.
    """
    try:
        with open(ANALYTICS_FILE, mode="w", encoding="utf-8") as handle:
            json.dump(data, handle, indent=2)
    except Exception as e:
        logger.warning(f"Failed to write analytics file: {e}")
def _record_query(query: str, success: bool, cache_hit: bool) -> None:
    """Record one query execution in the persisted analytics.

    Bumps the total, the success/failure counter, the cache hit/miss counter
    (both in the persisted data and in the in-process ``cache_metrics``), and
    the per-query pattern entry, then saves the analytics back to disk.
    """
    stats = _load_analytics()
    stats["total_queries"] += 1

    outcome_key = "successful_queries" if success else "failed_queries"
    stats[outcome_key] += 1

    # Persisted counter name and the matching runtime metric name.
    if cache_hit:
        persisted_key, runtime_key = "cache_hits", "hits"
    else:
        persisted_key, runtime_key = "cache_misses", "misses"
    stats[persisted_key] += 1
    cache_metrics[runtime_key] += 1

    per_query = stats["query_patterns"]
    record = per_query.get(query, {"count": 0, "successes": 0})
    record["count"] += 1
    if success:
        record["successes"] += 1
    per_query[query] = record

    _save_analytics(stats)
def get_search_stats() -> Dict[str, Any]:
    """Summarize persisted analytics together with the runtime cache metrics.

    Rates use a denominator of 1 when no queries (or no cache lookups) have
    been recorded, so they come out as 0 rather than dividing by zero.
    """
    data = _load_analytics()

    total_queries = data.get("total_queries", 0)
    successful = data.get("successful_queries", 0)
    hits = data.get("cache_hits", 0)
    misses = data.get("cache_misses", 0)

    success_rate = successful / (total_queries or 1)
    cache_hit_rate = hits / ((hits + misses) or 1)

    # Rank query strings by how often they were issued.
    frequency = Counter()
    for text, info in data.get("query_patterns", {}).items():
        frequency[text] = info["count"]
    top_queries = [text for text, _count in frequency.most_common(5)]

    return {
        "most_common_queries": top_queries,
        "success_rate": success_rate,
        "cache_hit_rate": cache_hit_rate,
        "total_queries": total_queries,
        "successful_queries": successful,
        "failed_queries": data.get("failed_queries", 0),
        "cache_hits": hits,
        "cache_misses": misses,
        "cache_evictions": cache_metrics["evictions"],
        "runtime_cache_hits": cache_metrics["hits"],
        "runtime_cache_misses": cache_metrics["misses"],
    }
+57
View File
@@ -0,0 +1,57 @@
"""Search and content caching with LRU eviction."""
import hashlib
import logging
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict
logger = logging.getLogger(__name__)

# Cache directories, located next to the package directory (two levels above
# this file).
CACHE_DIR = Path(__file__).resolve().parent.parent / "cache"
SEARCH_CACHE_DIR = CACHE_DIR / "search"
CONTENT_CACHE_DIR = CACHE_DIR / "content"
# Upper bound on index entries before cleanup_cache starts evicting the oldest.
CACHE_MAX_ENTRIES = 1000

# Create cache directories (runs at import time; a no-op if they already exist)
SEARCH_CACHE_DIR.mkdir(parents=True, exist_ok=True)
CONTENT_CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Track cache size for LRU eviction: cache key -> time the entry was written.
# These indexes live in memory only and start empty in every process.
search_cache_index: Dict[str, datetime] = {}
content_cache_index: Dict[str, datetime] = {}

# Cache metrics (shared across modules); mutated in place by importers.
cache_metrics = {"hits": 0, "misses": 0, "evictions": 0}
def generate_cache_key(data: str) -> str:
    """Return the hex SHA-256 digest of *data* (UTF-8 encoded) as a cache key."""
    hasher = hashlib.sha256()
    hasher.update(data.encode("utf-8"))
    return hasher.hexdigest()
def cleanup_cache(cache_dir: Path, cache_index: Dict[str, datetime], max_age: timedelta):
    """Drop stale entries from *cache_index* and cap it at CACHE_MAX_ENTRIES.

    An indexed entry is stale when it is older than *max_age* or when its
    ``<key>.cache`` file is no longer present in *cache_dir*; stale entries
    are removed from the index and their file (if any) is deleted. If the
    index is still over the limit afterwards, the oldest entries are evicted.
    Every removal increments ``cache_metrics["evictions"]``. Files that are
    not listed in the index are left alone.
    """
    now = datetime.now()

    # Map cache key (file name up to the first dot) -> path on disk.
    on_disk = {}
    for path in cache_dir.glob("*.cache"):
        on_disk[path.name.split(".")[0]] = path

    stale_keys = []
    for key, stamp in list(cache_index.items()):
        if not (now - stamp > max_age or key not in on_disk):
            continue
        stale_keys.append(key)
        if key in on_disk:
            on_disk[key].unlink(missing_ok=True)

    for key in stale_keys:
        cache_index.pop(key, None)
        cache_metrics["evictions"] += 1

    overflow = len(cache_index) - CACHE_MAX_ENTRIES
    if overflow > 0:
        # Oldest timestamps first; ties keep insertion order.
        oldest_first = sorted(cache_index, key=cache_index.__getitem__)
        for key in oldest_first[:overflow]:
            cache_index.pop(key, None)
            (cache_dir / f"{key}.cache").unlink(missing_ok=True)
            cache_metrics["evictions"] += 1
+360
View File
@@ -0,0 +1,360 @@
"""Webpage content fetching with caching, PDF extraction, and summarization helpers."""
import io
import ipaddress
import json
import os
import re
import logging
import socket
from datetime import datetime, timedelta
from typing import List
from urllib.parse import urljoin, urlparse
import httpx
from bs4 import BeautifulSoup
from .analytics import RateLimitError, error_logger
from .cache import (
CONTENT_CACHE_DIR,
content_cache_index,
generate_cache_key,
cleanup_cache,
)
logger = logging.getLogger(__name__)
_PRIVATE_NETWORKS = (
ipaddress.ip_network("0.0.0.0/8"),
ipaddress.ip_network("10.0.0.0/8"),
ipaddress.ip_network("127.0.0.0/8"),
ipaddress.ip_network("169.254.0.0/16"),
ipaddress.ip_network("172.16.0.0/12"),
ipaddress.ip_network("192.168.0.0/16"),
ipaddress.ip_network("::1/128"),
ipaddress.ip_network("fc00::/7"),
ipaddress.ip_network("fe80::/10"),
)
def _is_private_address(addr: ipaddress._BaseAddress) -> bool:
return addr.is_private or addr.is_loopback or addr.is_link_local or any(addr in net for net in _PRIVATE_NETWORKS)
def _resolve_hostname_ips(hostname: str) -> list[ipaddress._BaseAddress]:
try:
infos = socket.getaddrinfo(hostname, None)
except Exception:
return []
out = []
for info in infos:
try:
out.append(ipaddress.ip_address(info[4][0]))
except Exception:
continue
return out
def _public_http_url(url: str) -> bool:
    """Return True only for http(s) URLs whose host is publicly reachable.

    Rejects non-HTTP schemes, empty hosts, well-known internal host names and
    suffixes, literal IPs in private ranges, and host names for which DNS
    returns nothing or returns at least one private address. Any unexpected
    error also yields False.
    """
    blocked_names = ("localhost", "metadata", "metadata.google.internal")
    blocked_suffixes = (".local", ".localhost", ".internal", ".lan", ".intranet")
    try:
        parts = urlparse(url)
        if parts.scheme not in ("http", "https"):
            return False

        host = (parts.hostname or "").strip()
        if not host:
            return False

        normalized = host.lower()
        if normalized in blocked_names:
            return False
        if normalized.endswith(blocked_suffixes):
            return False

        # A literal IP address is judged directly, without a DNS lookup.
        try:
            return not _is_private_address(ipaddress.ip_address(host))
        except ValueError:
            pass

        candidates = _resolve_hostname_ips(host)
        if not candidates:
            return False
        return not any(_is_private_address(ip) for ip in candidates)
    except Exception:
        return False
def _get_public_url(url: str, headers: dict, timeout: int, max_redirects: int = 5) -> httpx.Response:
    """GET *url*, following redirects by hand so every hop is vetted.

    Each URL in the redirect chain must pass ``_public_http_url`` before it is
    requested. Raises ``httpx.RequestError`` when a hop is blocked or when more
    than *max_redirects* redirects are encountered. A redirect response with
    no ``Location`` header is returned as-is.
    """
    redirect_codes = (301, 302, 303, 307, 308)
    target = url
    hops = 0
    while hops <= max_redirects:
        if not _public_http_url(target):
            raise httpx.RequestError("Blocked private/internal URL", request=httpx.Request("GET", target))

        response = httpx.get(target, headers=headers, timeout=timeout, follow_redirects=False)
        if response.status_code not in redirect_codes:
            return response

        next_location = response.headers.get("location")
        if not next_location:
            return response

        # Location may be relative; resolve it against the URL just fetched.
        target = urljoin(str(response.url), next_location)
        hops += 1

    raise httpx.RequestError("Too many redirects", request=httpx.Request("GET", target))
# PDF extraction (optional dependency)
try:
from pdfminer.high_level import extract_text as pdf_extract_text
except ImportError:
pdf_extract_text = None # type: ignore
# ----------------------------------------------------------------------
# HTML extraction helpers
# ----------------------------------------------------------------------
def _extract_meta(soup: BeautifulSoup) -> dict:
    """Return the page's meta description and keywords (empty strings if absent).

    For each field, the first ``<meta>`` tag whose ``name`` attribute contains
    the field name (case-insensitive) is used.
    """
    found = {"description": "", "keywords": ""}
    for field in ("description", "keywords"):
        tag = soup.find("meta", attrs={"name": re.compile(field, re.I)})
        if tag and tag.get("content"):
            found[field] = tag["content"].strip()
    return found
def _extract_lists(soup: BeautifulSoup) -> List[List[str]]:
    """Collect the item texts of every non-empty <ul>/<ol> in the document."""
    per_list_items = (
        [item.get_text(separator=" ", strip=True) for item in container.find_all("li")]
        for container in soup.find_all(["ul", "ol"])
    )
    # Lists without any <li> descendants are dropped.
    return [items for items in per_list_items if items]
def _extract_tables(soup: BeautifulSoup) -> List[List[List[str]]]:
    """Collect tables as lists of rows, each row a list of cell texts.

    Rows without <td>/<th> cells and tables without any such rows are skipped.
    """
    extracted = []
    for table in soup.find_all("table"):
        row_cells = (
            [cell.get_text(separator=" ", strip=True) for cell in row.find_all(["td", "th"])]
            for row in table.find_all("tr")
        )
        kept_rows = [cells for cells in row_cells if cells]
        if kept_rows:
            extracted.append(kept_rows)
    return extracted
def _extract_code_blocks(soup: BeautifulSoup) -> List[str]:
    """Return the non-empty text of every <pre> and <code> element."""
    texts = (
        element.get_text(separator=" ", strip=True)
        for element in soup.find_all(["pre", "code"])
    )
    return [text for text in texts if text]
def _detect_js_frameworks(soup: BeautifulSoup) -> bool:
    """Guess whether the page relies on a common JavaScript framework.

    Deliberately naive: returns True when a framework name appears as a
    substring of any script ``src`` or inline script body, or when an element
    carries a ``data-reactroot`` or ``ng-app`` attribute.
    """
    markers = (
        "react", "angular", "vue", "svelte", "next", "nuxt",
        "ember", "backbone", "jquery", "polymer", "mithril",
    )

    def names_framework(text: str) -> bool:
        return any(marker in text for marker in markers)

    for tag in soup.find_all("script"):
        if names_framework(tag.get("src", "").lower()):
            return True
        inline_source = tag.string
        if inline_source and names_framework(inline_source.lower()):
            return True

    has_root_attribute = soup.find(attrs={"data-reactroot": True}) or soup.find(attrs={"ng-app": True})
    return bool(has_root_attribute)
def _empty_result(url: str, error: str = "") -> dict:
"""Build a standard failure result dict."""
return {
"url": url,
"title": "",
"content": "",
"lists": [],
"tables": [],
"code_blocks": [],
"meta_description": "",
"meta_keywords": "",
"js_rendered": False,
"js_message": "",
"success": False,
"error": error,
}
# ----------------------------------------------------------------------
# Main content fetcher
# ----------------------------------------------------------------------
def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) -> dict:
    """Fetch and extract meaningful content from a webpage with caching.

    Args:
        url: Page to fetch. Only public http(s) URLs are requested; private or
            internal targets are rejected by ``_get_public_url``.
        timeout: Per-request timeout passed to the HTTP client.
        retry_attempt: Attempt number, used only in log and error messages.

    Returns:
        A result dict with the keys ``url``, ``title``, ``content``, ``lists``,
        ``tables``, ``code_blocks``, ``meta_description``, ``meta_keywords``,
        ``js_rendered``, ``js_message``, ``success`` and ``error``. Network
        failures, HTTP error statuses and rate limiting produce a failure
        result (``success`` False) instead of raising. Results from fetched
        responses are cached on disk and reused for two hours.
    """
    cache_key = generate_cache_key(url)
    cache_file = CONTENT_CACHE_DIR / f"{cache_key}.cache"

    # Check cache
    if cache_file.exists():
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                cached_data = json.load(f)
            timestamp = datetime.fromisoformat(cached_data["timestamp"])
            if datetime.now() - timestamp < timedelta(hours=2):
                logger.debug(f"Content cache hit for URL: {url}")
                return cached_data["data"]
            else:
                # Expired: drop the entry and fall through to a fresh fetch.
                cache_file.unlink(missing_ok=True)
                content_cache_index.pop(cache_key, None)
        except Exception as e:
            # Unreadable or malformed cache file: discard it and refetch.
            logger.warning(f"Failed to read content cache for {url}: {e}")
            cache_file.unlink(missing_ok=True)
            content_cache_index.pop(cache_key, None)

    # Fetch
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
        response = _get_public_url(url, headers=headers, timeout=timeout)
        if response.status_code == 429:
            raise RateLimitError(f"Rate limit hit for {url} (attempt {retry_attempt})")
        response.raise_for_status()
    except httpx.RequestError as e:
        error_logger.error(f"NetworkError fetching {url} (attempt {retry_attempt}): {e}")
        return _empty_result(url, f"NetworkError: {e}")
    except httpx.HTTPStatusError as e:
        # raise_for_status() signals error statuses with HTTPStatusError, which
        # is not a RequestError; without this clause it escaped to the caller.
        error_logger.error(f"HTTPError fetching {url} (attempt {retry_attempt}): {e}")
        return _empty_result(url, f"HTTPError: {e}")
    except RateLimitError as e:
        error_logger.error(str(e))
        return _empty_result(url, str(e))

    # PDF handling
    content_type = response.headers.get("Content-Type", "").lower()
    if "application/pdf" in content_type or url.lower().endswith(".pdf"):
        if pdf_extract_text is None:
            logger.error("pdfminer.six is not installed; cannot extract PDF text.")
            pdf_text = ""
        else:
            try:
                pdf_bytes = io.BytesIO(response.content)
                pdf_text = pdf_extract_text(pdf_bytes)
            except Exception as e:
                logger.warning(f"PDF extraction failed for {url}: {e}")
                pdf_text = ""
        result = {
            "url": url,
            "title": os.path.basename(url),
            "content": pdf_text,
            "lists": [],
            "tables": [],
            "code_blocks": [],
            "meta_description": "",
            "meta_keywords": "",
            "js_rendered": False,
            "js_message": "",
            "success": bool(pdf_text),
            "error": "" if pdf_text else "Failed to extract PDF text",
        }
        _cache_result(cache_file, cache_key, result, url)
        return result

    # HTML handling
    try:
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        error_logger.error(f"ParseError parsing HTML from {url} (attempt {retry_attempt}): {e}")
        result = _empty_result(url, f"ParseError: {e}")
        _cache_result(cache_file, cache_key, result, url)
        return result

    title_tag = soup.find("title")
    title_text = title_tag.get_text(strip=True) if title_tag else ""
    meta_info = _extract_meta(soup)
    js_rendered = _detect_js_frameworks(soup)
    js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""

    # Main textual content (heuristic): prefer up to three containers whose
    # class name looks content-like, otherwise fall back to the whole <body>.
    main_content = ""
    content_areas = soup.find_all(
        ["main", "article", "section", "div"],
        class_=re.compile("content|main|body|article|post|entry|text", re.I),
    )
    if content_areas:
        for area in content_areas[:3]:
            main_content += area.get_text(separator=" ", strip=True) + " "
    if not main_content:
        body = soup.find("body")
        if body:
            main_content = body.get_text(separator=" ", strip=True)
    # Collapse whitespace and cap the stored text at 8000 characters.
    main_content = re.sub(r"\s+", " ", main_content).strip()[:8000]

    result = {
        "url": url,
        "title": title_text,
        "content": main_content,
        "lists": _extract_lists(soup),
        "tables": _extract_tables(soup),
        "code_blocks": _extract_code_blocks(soup),
        "meta_description": meta_info.get("description", ""),
        "meta_keywords": meta_info.get("keywords", ""),
        "js_rendered": js_rendered,
        "js_message": js_message,
        "success": True,
        "error": "",
    }
    _cache_result(cache_file, cache_key, result, url)
    return result
def _cache_result(cache_file, cache_key: str, result: dict, url: str):
    """Persist *result* to *cache_file* and register it in the content index.

    After writing, the content cache is cleaned up with a two-hour maximum
    age. Best effort: any failure is logged as a warning and otherwise ignored.
    """
    try:
        envelope = {"timestamp": datetime.now().isoformat(), "data": result}
        with open(cache_file, mode="w", encoding="utf-8") as handle:
            json.dump(envelope, handle)
        content_cache_index[cache_key] = datetime.now()
        max_age = timedelta(hours=2)
        cleanup_cache(CONTENT_CACHE_DIR, content_cache_index, max_age)
    except Exception as e:
        logger.warning(f"Failed to write content cache for {url}: {e}")
# ----------------------------------------------------------------------
# Content summarization helpers
# ----------------------------------------------------------------------
def extract_key_points(text: str) -> List[str]:
    """Return the text of every bulleted or numbered line in *text*.

    A line counts when it starts (after optional whitespace) with ``-``, ``*``
    or ``•``, or with digits followed by ``.`` or ``)``, and then whitespace.
    The marker is removed and the remainder stripped.
    """
    line_patterns = (
        re.compile(r"^\s*[-*•]\s+(.*)"),
        re.compile(r"^\s*\d+[\.\)]\s+(.*)"),
    )
    found: List[str] = []
    for raw_line in text.splitlines():
        # Lazily try the bullet pattern first, then the numbered pattern.
        attempts = (pattern.match(raw_line) for pattern in line_patterns)
        hit = next((m for m in attempts if m), None)
        if hit is not None:
            found.append(hit.group(1).strip())
    return found
def get_tldr(text: str, max_sentences: int = 3) -> str:
    """Return the first *max_sentences* sentences of *text*, space-joined.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
    """
    pieces = re.split(r"(?<=[.!?])\s+", text)
    non_empty = list(filter(None, pieces))
    leading = non_empty[:max_sentences]
    return " ".join(map(str.strip, leading))
def extract_quotes(text: str) -> List[str]:
    """Return quoted passages of 15 or more characters found in *text*.

    A passage is delimited by straight single or double quote characters and
    contains neither; the result holds the stripped inner text.
    """
    # With a single capture group, findall yields the captured inner text.
    passages = re.findall(r'["\']([^"\']{15,}?)["\']', text)
    return [passage.strip() for passage in passages]
def extract_statistics(text: str) -> List[str]:
    """Find numbers, percentages, years and simple measurements in *text*.

    Matches plain integers of any length (``2024``), comma-grouped numbers
    (``1,234``), decimals (``3.14``), and an optional trailing unit: ``%``,
    ``‰``, ``per cent`` or a single alphabetic word (``5 km``, ``3 percent``).
    Digits embedded in a word (``abc123``) are ignored.

    Returns:
        The matched substrings, stripped, in order of appearance.
    """
    pattern = re.compile(
        r"(?<![\w.])"                    # not inside a word or right after a dot
        r"(?:\d{1,3}(?:,\d{3})+|\d+)"    # 1,234,567 or plain digits of any length
        r"(?:\.\d+)?"                    # optional decimal part
        # Optional unit. '%' and '‰' are not word characters, so they must not
        # be followed by a \b check (that would always drop the sign); a bare
        # number only needs to end where no word character follows.
        r"(?:\s*[%‰]|\s*(?:per cent|[a-zA-Z]+)\b|(?!\w))",
        re.IGNORECASE,
    )
    return [m.group(0).strip() for m in pattern.finditer(text)]
+433
View File
@@ -0,0 +1,433 @@
"""Core search orchestrators: searxng_search_results, comprehensive_web_search, config, cache invalidation."""
import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List, Set
from urllib.parse import urlparse
from .analytics import (
NetworkError,
ParseError,
RateLimitError,
error_logger,
_record_query,
)
from .cache import (
SEARCH_CACHE_DIR,
search_cache_index,
generate_cache_key,
cleanup_cache,
)
from .query import _cache_duration_for_query
from .ranking import rank_search_results
from .providers import (
searxng_search_api,
brave_search,
duckduckgo_search,
google_pse_search,
tavily_search,
serper_search,
_get_search_settings,
_get_result_count,
)
from .content import (
fetch_webpage_content,
extract_key_points,
get_tldr,
extract_quotes,
extract_statistics,
)
logger = logging.getLogger(__name__)
# ========= CONFIG =========
# Process-wide, mutable search configuration; update_search_config writes to it.
SEARCH_CONFIG: Dict[str, Any] = dict(primary_provider="searxng")
def get_search_config() -> Dict[str, Any]:
    """Return a snapshot of the search configuration and active provider.

    The result is a copy of ``SEARCH_CONFIG`` extended with
    ``active_provider``, ``has_api_key`` and ``result_count``; when the active
    provider is SearXNG it also carries the instance URL as ``search_url``.
    """
    settings = _get_search_settings()
    active = settings.get("search_provider", "searxng")
    shared_key = (settings.get("search_api_key") or "").strip()

    snapshot = {
        **SEARCH_CONFIG,
        "active_provider": active,
        "has_api_key": bool(shared_key),
        "result_count": _get_result_count(),
    }
    if active == "searxng":
        from .providers import _get_search_instance
        snapshot["search_url"] = _get_search_instance()
    return snapshot
def update_search_config(api_key: str = None, **kwargs):
    """Store *api_key* as the Brave API key in ``SEARCH_CONFIG``.

    A missing or empty key leaves the configuration untouched. Extra keyword
    arguments are accepted and ignored.
    """
    if not api_key:
        return
    SEARCH_CONFIG["brave_api_key"] = api_key
def _call_provider(provider_name: str, query: str, count: int, time_filter: str = None) -> List[dict]:
    """Dispatch a search to the provider named *provider_name*.

    Returns whatever the provider function returns, or an empty list when the
    name is not a known provider.
    """
    # SearXNG is the only provider that takes the time filter by keyword.
    if provider_name == "searxng":
        return searxng_search_api(query, count, time_filter=time_filter)

    positional_providers = {
        "brave": brave_search,
        "duckduckgo": duckduckgo_search,
        "google_pse": google_pse_search,
        "tavily": tavily_search,
        "serper": serper_search,
    }
    handler = positional_providers.get(provider_name)
    if handler is None:
        return []
    return handler(query, count, time_filter)
# If the self-hosted SearXNG instance is up but all enabled engines return
# empty, fall back to the no-key provider so "search X" still works on fresh
# installs. Users can override/disable with `search_fallback_chain`.
# Default fallbacks, tried in order after the primary provider.
_FALLBACK_ORDER = ["duckduckgo"]
def _build_provider_chain(primary: str) -> List[str]:
    """Return the providers to try, in order: *primary*, then the fallbacks.

    Fallbacks come from the ``search_fallback_chain`` setting (a list or a
    comma-separated string) or, when that is empty, from ``_FALLBACK_ORDER``.
    Empty names, duplicates and the value ``"disabled"`` are skipped.
    """
    configured = _get_search_settings().get("search_fallback_chain") or []
    if isinstance(configured, str):
        configured = [part.strip() for part in configured.split(",") if part.strip()]

    ordered = [primary]
    for candidate in (configured if configured else _FALLBACK_ORDER):
        if not candidate or candidate == primary:
            continue
        if candidate in ordered or candidate == "disabled":
            continue
        ordered.append(candidate)
    return ordered
# ----------------------------------------------------------------------
# Unified search with caching and retry
# ----------------------------------------------------------------------
def searxng_search_results(query: str, count: int = 10, time_filter: str = None) -> list[dict]:
    """Perform a web search using configured provider with caching and retry.

    Args:
        query: Search query text.
        count: Number of results to request. The default value 10 is treated
            as "not specified" and replaced by the configured result count.
        time_filter: Optional time filter passed through to the provider.

    Returns:
        The ranked results, or an empty list when search is disabled or every
        provider in the chain failed or returned nothing.
    """
    settings = _get_search_settings()
    search_provider = settings.get("search_provider", "searxng")
    result_count = _get_result_count()
    # Use configured count if caller used default
    if count == 10:
        count = result_count
    # The cache key is derived from the effective count, i.e. after the
    # substitution above.
    cache_key = generate_cache_key(f"{query}|{count}|{time_filter}")
    cache_file = SEARCH_CACHE_DIR / f"{cache_key}.cache"
    # Check cache
    if cache_file.exists():
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                cached_data = json.load(f)
            expiry_raw = cached_data.get("expiry")
            expiry = datetime.fromisoformat(expiry_raw) if expiry_raw else None
            if expiry and datetime.now() < expiry:
                logger.debug(f"Search cache hit for query: {query}")
                results = cached_data["data"]
                _record_query(query, bool(results), cache_hit=True)
                return results
            else:
                # Expired, or written without an expiry: discard and refetch.
                cache_file.unlink(missing_ok=True)
                search_cache_index.pop(cache_key, None)
        except Exception as e:
            # Unreadable or malformed cache file: discard and refetch.
            logger.warning(f"Failed to read search cache for {query}: {e}")
            cache_file.unlink(missing_ok=True)
            search_cache_index.pop(cache_key, None)
    logger.debug(f"Search cache miss for query: {query}")
    # Note: the disabled check runs after the cache lookup, so a still-valid
    # cached entry is served even when search has been disabled.
    if search_provider == "disabled":
        logger.info("Search is disabled via admin settings")
        return []
    provider_chain = _build_provider_chain(search_provider)
    results: List[dict] = []
    # Try each provider up to twice; the first non-empty result wins.
    for provider_name in provider_chain:
        for attempt in range(2):
            try:
                logger.info(f"Attempting {provider_name} search (attempt {attempt + 1})")
                results = _call_provider(provider_name, query, count, time_filter)
                if results:
                    logger.info(f"{provider_name} search succeeded with {len(results)} results")
                    break
            except (NetworkError, ParseError, RateLimitError) as e:
                error_logger.error(f"{provider_name} search error (attempt {attempt + 1}): {e}")
            except Exception as e:
                error_logger.error(f"Unexpected error during {provider_name} search (attempt {attempt + 1}): {e}")
        if results:
            break
    success = bool(results)
    _record_query(query, success, cache_hit=False)
    if success:
        results = rank_search_results(query, results)
        # Only non-empty results are cached; the expiry is query-dependent.
        try:
            expiry = datetime.now() + _cache_duration_for_query(query)
            cache_data = {
                "timestamp": datetime.now().isoformat(),
                "expiry": expiry.isoformat(),
                "data": results,
            }
            with open(cache_file, "w", encoding="utf-8") as f:
                json.dump(cache_data, f)
            search_cache_index[cache_key] = datetime.now()
            # NOTE(review): cleanup uses a fixed one-hour max age regardless of
            # the per-query expiry stored above — confirm this is intended.
            cleanup_cache(SEARCH_CACHE_DIR, search_cache_index, timedelta(hours=1))
        except Exception as e:
            logger.warning(f"Failed to write search cache for {query}: {e}")
    if not success:
        logger.error(f"All search providers failed for query: {query}")
    return results
# ----------------------------------------------------------------------
# Cache invalidation
# ----------------------------------------------------------------------
def invalidate_search_cache(query: Optional[str] = None) -> None:
    """Invalidate cached search results.

    Args:
        query: ``None`` clears every cached search. Otherwise only entries for
            this query, stored without a time filter under the default or the
            configured result count, are removed. Entries cached with an
            explicit time filter or a custom count are not affected.
    """
    if query is None:
        for file in SEARCH_CACHE_DIR.glob("*.cache"):
            try:
                file.unlink(missing_ok=True)
            except Exception as e:
                error_logger.warning(f"Failed to delete cache file {file}: {e}")
        search_cache_index.clear()
        logger.info("All search cache entries have been cleared.")
        return

    # searxng_search_results replaces the default count (10) with the
    # configured result count before hashing its cache key, so the entry is
    # normally stored under the configured count rather than under 10.
    # Cover both so a default search is actually invalidated.
    candidate_counts = sorted({10, _get_result_count()})
    found = False
    for count in candidate_counts:
        cache_key = generate_cache_key(f"{query}|{count}|None")
        cache_file = SEARCH_CACHE_DIR / f"{cache_key}.cache"
        if not cache_file.exists():
            continue
        found = True
        try:
            cache_file.unlink(missing_ok=True)
            search_cache_index.pop(cache_key, None)
            logger.info(f"Cache entry for query '{query}' has been invalidated.")
        except Exception as e:
            error_logger.warning(f"Failed to delete cache file for query '{query}': {e}")
    if not found:
        logger.info(f"No cache entry found for query '{query}'.")
# ----------------------------------------------------------------------
# Comprehensive web search (with advanced filtering)
# ----------------------------------------------------------------------
def comprehensive_web_search(
    query: str,
    max_pages: int = 3,
    max_workers: int = 4,
    time_filter: str = None,
    domain_whitelist: Optional[Set[str]] = None,
    domain_blacklist: Optional[Set[str]] = None,
    content_type: Optional[str] = None,
    language: Optional[str] = None,
    min_content_length: int = 0,
    return_sources: bool = False,
):
    """Perform comprehensive web search with content fetching and advanced filtering.

    Args:
        query: Search query text.
        max_pages: Maximum number of result pages whose content is fetched.
        max_workers: Thread pool size for parallel page fetching.
        time_filter: Optional time filter passed to the search provider.
        domain_whitelist: If given, only URLs whose netloc is in this set pass.
        domain_blacklist: If given, URLs whose netloc is in this set are dropped.
        content_type: Optional URL keyword heuristic: ``"article"``,
            ``"forum"`` or ``"academic"``.
        language: Optional language code that must appear in the URL as a path
            segment or a ``lang`` query parameter.
        min_content_length: Minimum length of fetched content to keep a page.
        return_sources: When True, return ``(text, sources)`` instead of text.

    Returns:
        The formatted report text, or a ``(text, sources)`` tuple when
        *return_sources* is True. ``sources`` is a list of
        ``{"url", "title"}`` dicts and is empty when the search produced
        nothing usable; in that case the text is a short explanatory message.
    """
    logger.info(f"Starting comprehensive search for: {query}")
    if time_filter:
        logger.info(f"Applying time filter: {time_filter}")

    settings = _get_search_settings()
    search_provider = settings.get("search_provider", "searxng")
    result_count = _get_result_count()

    if search_provider == "disabled":
        logger.info("Search is disabled via admin settings")
        msg = "Web search is disabled by the administrator."
        return (msg, []) if return_sources else msg

    # Use configured result count (at least max_pages for content fetching)
    fetch_count = max(result_count, max_pages)

    provider_chain = _build_provider_chain(search_provider)
    search_results = []
    # Per-provider outcome, reported to the caller when nothing was found.
    provider_attempts = {}
    for provider_name in provider_chain:
        last_err = None
        empty = False
        for attempt in range(2):
            try:
                search_results = _call_provider(provider_name, query, fetch_count, time_filter)
                if search_results:
                    provider_attempts[provider_name] = f"ok ({len(search_results)})"
                    logger.info(f"Comprehensive search: {provider_name} returned {len(search_results)} results")
                    break
                empty = True
            except Exception as e:
                last_err = e
                logger.warning(f"Comprehensive search: {provider_name} attempt {attempt + 1} failed: {e}")
        if search_results:
            break
        if last_err is not None:
            provider_attempts[provider_name] = f"error: {last_err}"
        elif empty:
            provider_attempts[provider_name] = "empty"

    if not search_results:
        tally = ", ".join(f"{p}:{r}" for p, r in provider_attempts.items()) or "no providers configured"
        any_errors = any(r.startswith("error") for r in provider_attempts.values())
        if any_errors:
            msg = f"Web search failed — all providers errored or returned empty. Tried: {tally}"
        else:
            msg = (
                f"No search results found. Tried: {tally}. "
                "All providers returned empty — possibly a niche query or upstream rate-limiting; "
                "rephrasing or using the browser tool for a specific URL may help."
            )
        logger.warning(msg)
        return (msg, []) if return_sources else msg

    search_results = rank_search_results(query, search_results)

    # URL filter helper
    def url_passes_filters(url: str) -> bool:
        try:
            netloc = urlparse(url).netloc.lower()
        except Exception:
            return False
        if domain_whitelist is not None and netloc not in domain_whitelist:
            return False
        if domain_blacklist is not None and netloc in domain_blacklist:
            return False
        if content_type:
            ct = content_type.lower()
            if ct == "article":
                if not any(k in url.lower() for k in ("article", "blog", "news", "post")):
                    return False
            elif ct == "forum":
                if not any(k in url.lower() for k in ("forum", "discussion", "thread", "topic")):
                    return False
            elif ct == "academic":
                if not any(k in url.lower() for k in ("pdf", "doi", "scholar", "arxiv", "journal", "research")):
                    return False
        if language:
            lang_pat = language.lower()
            if not (f"/{lang_pat}/" in url.lower() or f"?lang={lang_pat}" in url.lower() or f"&lang={lang_pat}" in url.lower()):
                return False
        return True

    # Filter first, then cap at max_pages. Truncating before filtering could
    # reject every candidate even though lower-ranked results pass the filters.
    passing_urls = [
        candidate
        for candidate in (r.get("url") for r in search_results)
        if candidate and url_passes_filters(candidate)
    ]
    filtered_urls = passing_urls[:max_pages]
    if not filtered_urls:
        logger.warning("All URLs filtered out by advanced criteria")
        msg = "No suitable results after applying filters."
        return (msg, []) if return_sources else msg

    # Build sources list for the frontend (before content fetching)
    _source_list = [
        {"url": r.get("url", ""), "title": r.get("title", "")}
        for r in search_results if r.get("url")
    ]

    # Fetch content in parallel
    fetched_content = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(fetch_webpage_content, url, 8, retry_attempt=0): url
            for url in filtered_urls
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                if result["success"] and result["content"] and len(result["content"]) >= min_content_length:
                    fetched_content.append(result)
            except Exception as e:
                logger.error(f"Exception while fetching {url}: {str(e)}")
    logger.info(f"Successfully fetched content from {len(fetched_content)} pages")

    # Format results. Provider results are read with .get() so an entry that
    # lacks a title, URL or snippet cannot abort formatting with a KeyError.
    output_parts = []
    if search_results:
        output_parts.append("```sources")
        for i, result in enumerate(search_results, 1):
            output_parts.append(f"[{i}] {result.get('title') or ''}")
            output_parts.append(f" {result.get('url') or ''}")
            if result.get("age"):
                output_parts.append(f" {result['age']}")
        output_parts.append("```")
        output_parts.append("")

    output_parts.append("=" * 70)
    output_parts.append("WEB SEARCH RESULTS AND FETCHED CONTENT")
    output_parts.append(f"Query: {query}")
    output_parts.append(f"Searched {len(search_results)} results, fetched {len(fetched_content)} pages")
    output_parts.append("=" * 70)
    output_parts.append("")
    output_parts.append("SEARCH RESULTS SUMMARY:")
    output_parts.append("-" * 50)
    for i, result in enumerate(search_results, 1):
        snippet = result.get("snippet") or ""
        output_parts.append(f"\n[{i}] {result.get('title') or ''}")
        output_parts.append(f" URL: {result.get('url') or ''}")
        output_parts.append(f" Snippet: {snippet[:200]}...")
        if result.get("age"):
            output_parts.append(f" Age: {result['age']}")

    if fetched_content:
        output_parts.append("\n" + "=" * 70)
        output_parts.append("FETCHED PAGE CONTENT:")
        output_parts.append("-" * 50)
        for i, content in enumerate(fetched_content, 1):
            output_parts.append(f"\n[CONTENT {i}] From: {content['url']}")
            output_parts.append(f"Title: {content['title']}")
            output_parts.append("-" * 30)
            text = content["content"][:3000]
            if len(content["content"]) > 3000:
                text += "... [truncated]"
            output_parts.append(text)
            key_points = extract_key_points(content["content"])
            if key_points:
                output_parts.append("\nKey Points:")
                for pt in key_points[:5]:
                    output_parts.append(f"- {pt}")
            tldr = get_tldr(content["content"])
            if tldr:
                output_parts.append("\nTL;DR:")
                output_parts.append(tldr)
            quotes = extract_quotes(content["content"])
            if quotes:
                output_parts.append("\nImportant Quotes:")
                for q in quotes[:3]:
                    output_parts.append(f"\u201c{q}\u201d")
            stats = extract_statistics(content["content"])
            if stats:
                output_parts.append("\nData / Statistics:")
                for s in stats[:5]:
                    output_parts.append(f"- {s}")
            output_parts.append("")

    output_parts.append("=" * 70)
    output_parts.append("END OF WEB SEARCH RESULTS")
    output_parts.append("=" * 70)
    instructions = (
        "\n\nIMPORTANT INSTRUCTIONS:\n"
        "1. Use the above web search results and fetched content to answer the user's question\n"
        "2. Prioritize information from the FETCHED PAGE CONTENT section as it contains actual page data\n"
        "3. Cross-reference multiple sources when possible\n"
        "4. If the information is time-sensitive, pay attention to the age of the results\n"
        "5. Be explicit if the search results don't contain sufficient information to fully answer the question"
    )
    output_parts.append(instructions)
    result = "\n".join(output_parts)
    return (result, _source_list) if return_sources else result
+527
View File
@@ -0,0 +1,527 @@
"""Search provider implementations: SearXNG, Brave, DuckDuckGo, Google PSE, Tavily, Serper."""
import json
import logging
import os
from typing import List, Optional
import httpx
from bs4 import BeautifulSoup
from src.constants import SEARXNG_INSTANCE
from .analytics import RateLimitError, error_logger
from .query import build_enhanced_query
logger = logging.getLogger(__name__)
# Timeout (seconds) handed to httpx by the Brave, DuckDuckGo-HTML, Google PSE,
# Tavily and Serper calls in this module. The SearXNG calls use their own
# literal timeouts instead.
REQUEST_TIMEOUT = 20
# Provider registry — maps setting value to (label, needs_key, needs_url)
PROVIDER_INFO = {
    "searxng": ("SearXNG", False, True),
    "brave": ("Brave Search", True, False),
    "duckduckgo": ("DuckDuckGo", False, False),
    "google_pse": ("Google PSE", True, False),
    "tavily": ("Tavily", True, False),
    "serper": ("Serper", True, False),
    "disabled": ("Disabled", False, False),
}
# ── Settings helpers ──
def _get_search_settings() -> dict:
"""Return search settings from admin config, falling back to env defaults."""
try:
from src.settings import load_settings
return load_settings()
except Exception:
return {}
def _get_search_instance() -> str:
    """Return the base URL of the search instance to query.

    Uses the ``search_url`` admin setting (whitespace-trimmed, trailing
    slashes removed) when it is non-empty; otherwise returns the
    ``SEARXNG_INSTANCE`` constant unchanged.
    """
    configured = (_get_search_settings().get("search_url") or "").strip()
    return configured.rstrip("/") if configured else SEARXNG_INSTANCE
def _get_provider_key(provider: str) -> str:
    """Look up the API key configured for *provider*.

    The provider-specific settings field is consulted first; when it is
    missing or blank (or the provider has no dedicated field) the legacy
    shared ``search_api_key`` field is used. Returns "" when nothing is set.
    """
    settings = _get_search_settings()
    field_by_provider = {
        "brave": "brave_api_key",
        "google_pse": "google_pse_key",
        "tavily": "tavily_api_key",
        "serper": "serper_api_key",
    }
    # Dedicated field (if any) takes precedence over the legacy shared one.
    lookup_order = []
    dedicated = field_by_provider.get(provider, "")
    if dedicated:
        lookup_order.append(dedicated)
    lookup_order.append("search_api_key")
    for field_name in lookup_order:
        candidate = (settings.get(field_name) or "").strip()
        if candidate:
            return candidate
    return ""
def _get_result_count() -> int:
    """Return the ``search_result_count`` setting as an int.

    Falls back to 5 when the setting is absent or cannot be converted.
    """
    settings = _get_search_settings()
    try:
        configured = int(settings.get("search_result_count", 5))
    except (ValueError, TypeError):
        configured = 5
    return configured
# ── SearXNG ──
# Substrings that make searxng_search_api treat a query as a news lookup; they
# are tested against the lower-cased query text ("nyheter" / "idag" are the
# Swedish words for "news" / "today").
_NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag")
# Default general engines (google/duckduckgo/brave/startpage/wikipedia) are
# routinely rate-limited / CAPTCHA-blocked on this instance and return nothing.
# Pin engines that actually respond so non-news queries get results without any
# third-party API fallback. Override via SEARXNG_GENERAL_ENGINES.
# The value is a comma-separated string sent as the "engines" request
# parameter; an empty string disables pinning.
_GENERAL_ENGINES = os.environ.get("SEARXNG_GENERAL_ENGINES", "bing,mojeek,presearch")
def searxng_search_api(query: str, count: int = 10, categories: str = "general",
                       time_filter: Optional[str] = None) -> List[dict]:
    """Search using SearXNG JSON API. Returns list of {title, url, snippet}.

    Requests ``{instance}/search`` with ``format=json``. When a request yields
    no usable results, progressively looser requests are tried in this order:
    news category -> pinned general engines, then without the language pin,
    then without the engine pin. If any request raises, the HTML scraper
    ``searxng_search`` is used instead.

    Args:
        query: Search terms.
        count: Maximum number of results to return.
        categories: SearXNG category. "general" is switched to "news" when
            the query looks like a news lookup.
        time_filter: Optional recency hint ("day", "week", "month", "year").
            Any non-None value marks the query as a news lookup.

    Returns:
        A list of dicts with string "title", "url" and "snippet" values;
        possibly empty.
    """
    instance = _get_search_instance()
    # No API key is wired up for SearXNG, so only a User-Agent is sent.
    headers = {"User-Agent": "Mozilla/5.0"}
    # News/fresh queries do badly in the 'general' category — it favours
    # encyclopedic/tourism pages, ignores recency, and (with no language pin)
    # bleeds in foreign-language results. When the agent layer detected
    # freshness (time_filter) or the query reads like a news lookup, switch to
    # the 'news' category, constrain recency, and pin language to English so a
    # search like "Canada latest news" returns actual news instead of Wikipedia.
    # Pin English for ALL searches — without it, SearXNG geolocates / mixes
    # languages and brand-ambiguous terms bleed in foreign SEO pages (e.g.
    # "Odyssey" → Honda Japan, "Trojan" → Japanese malware blogs, "Polyphemus"
    # → Chinese math forums). The news path already did this; general didn't.
    params = {"q": query, "format": "json", "language": "en"}
    q_lc = query.lower()
    is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS)
    if is_news and categories == "general":
        params["categories"] = "news"
        if time_filter in ("day", "week", "month", "year"):
            # 'day' is too sparse on most SearXNG news engines — widen to a week
            # so there's enough volume; the news category already biases recent.
            params["time_range"] = "week" if time_filter in ("day", "week") else time_filter
    else:
        params["categories"] = categories
        # Route general queries to engines that aren't blocked (default general
        # set returns 0 on this instance — see _GENERAL_ENGINES).
        if categories == "general" and _GENERAL_ENGINES:
            params["engines"] = _GENERAL_ENGINES

    def _parse_results(results):
        # Coalesce null fields to "" so downstream formatting can slice and
        # concatenate them without a None check.
        return [
            {
                "title": r.get("title") or "",
                "url": r.get("url") or "",
                "snippet": r.get("content") or "",
            }
            for r in results[:count]
            if r.get("url")
        ]

    def _run(search_params):
        response = httpx.get(
            f"{instance}/search",
            params=search_params,
            headers=headers,
            timeout=15,
        )
        response.raise_for_status()
        data = response.json()
        return _parse_results(data.get("results", [])), data

    try:
        active_params = params
        parsed, data = _run(active_params)
        if not parsed and is_news and categories == "general":
            # Some self-hosted SearXNG configs have no working news engines.
            # Fall back to the known-good general engines before reporting an
            # empty search, otherwise common queries like "Canada news" fail.
            fallback = {
                "q": query,
                "format": "json",
                "language": "en",
                "categories": "general",
            }
            if _GENERAL_ENGINES:
                fallback["engines"] = _GENERAL_ENGINES
            logger.info(
                "SearXNG news search returned 0 results for %r; retrying general engines",
                query,
            )
            active_params = fallback
            parsed, data = _run(active_params)
        if not parsed and active_params.get("language"):
            fallback = dict(active_params)
            fallback.pop("language", None)
            logger.info(
                "SearXNG language-pinned search returned 0 results for %r; retrying without language",
                query,
            )
            active_params = fallback
            parsed, data = _run(active_params)
        if not parsed and active_params.get("engines"):
            fallback = dict(active_params)
            fallback.pop("engines", None)
            logger.info(
                "SearXNG pinned engines returned 0 results for %r; retrying default engines",
                query,
            )
            parsed, data = _run(fallback)
        logger.info(f"SearXNG JSON API returned {len(parsed)} results for: {query}")
        if not parsed:
            unresponsive = data.get("unresponsive_engines") if isinstance(data, dict) else None
            if unresponsive:
                logger.info(f"SearXNG unresponsive engines for {query!r}: {unresponsive}")
        return parsed
    except Exception as e:
        # Any transport, HTTP-status or parsing failure: fall through to HTML.
        logger.warning(f"SearXNG JSON API search failed: {e}")
    html_results = searxng_search(query, max_results=count)
    if html_results:
        logger.info(f"SearXNG HTML fallback returned {len(html_results)} results for: {query}")
    return html_results
def searxng_search(query, max_results=10):
    """Search using SearXNG instance - parsing HTML.

    Fetches ``{instance}/search`` and scrapes ``article.result`` elements.

    Args:
        query: Search terms.
        max_results: Maximum number of result elements to inspect.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts. An empty list is
        returned on a non-2xx response or on any exception.
    """
    instance = _get_search_instance()
    # No API key is wired up for SearXNG, so only a User-Agent is sent.
    req_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = httpx.get(
            f"{instance}/search",
            params={"q": query},
            headers=req_headers,
            timeout=10,
        )
        if not response.is_success:
            # Surface the status instead of silently reporting "no results".
            logger.warning(f"SearXNG search (HTML) returned HTTP {response.status_code}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")
        results = []
        for article in soup.select("article.result")[:max_results]:
            title_elem = article.select_one("h3 a")
            if not title_elem:
                continue
            url = title_elem.get("href", "")
            if not url:
                # Same rule as the JSON path and the other providers: a
                # result without a URL is not usable.
                continue
            snippet_elem = article.select_one("p.content")
            results.append({
                "title": title_elem.get_text(strip=True),
                "url": url,
                "snippet": snippet_elem.get_text(strip=True) if snippet_elem else "",
            })
        logger.info(f"SearXNG search (HTML) returned {len(results)} results")
        return results
    except Exception as e:
        logger.error(f"SearXNG search failed: {e}")
        return []
# ── Brave ──
def brave_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Run a Brave search using the configured key.

    The key comes from the provider settings, then from the
    ``DATA_BRAVE_API_KEY`` environment variable, and is "" when neither is
    set. The request itself is delegated to ``_brave_search_impl``.
    """
    resolved_key = _get_provider_key("brave")
    if not resolved_key:
        resolved_key = os.environ.get("DATA_BRAVE_API_KEY") or ""
    config = {"brave_api_key": resolved_key}
    return _brave_search_impl(query, count, time_filter, search_config=config)
def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None, search_config: Optional[dict] = None) -> List[dict]:
    """Core Brave API call. Returns a list of result dicts or an empty list on failure.

    Args:
        query: Raw user query; it is passed through ``build_enhanced_query``.
        count: Maximum number of results wanted (the request is capped at 20,
            the Brave API maximum).
        time_filter: Optional "day" / "week" / "month" / "year" recency filter.
        search_config: Optional dict; its ``brave_api_key`` entry is used
            before the ``DATA_BRAVE_API_KEY`` environment variable.

    Returns:
        A list of ``{"title", "url", "snippet", "age"}`` dicts. Missing key,
        rate limiting, transport errors, HTTP error statuses and unparsable
        bodies all yield an empty list.
    """
    config = search_config or {}
    brave_api_key = config.get("brave_api_key") or os.environ.get("DATA_BRAVE_API_KEY")
    if not brave_api_key:
        logger.warning("Brave API key not found, returning empty results for fallback")
        return []
    enhanced_query = build_enhanced_query(query, time_filter)
    headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"}
    params = {"q": enhanced_query, "count": min(count, 20)}
    # Brave's `freshness` parameter takes pd/pw/pm/py, not the plain words.
    freshness_map = {"day": "pd", "week": "pw", "month": "pm", "year": "py"}
    if time_filter in freshness_map:
        params["freshness"] = freshness_map[time_filter]
    logger.info(f"Executing Brave search with query: {enhanced_query}")
    try:
        response = httpx.get(
            "https://api.search.brave.com/res/v1/web/search",
            headers=headers,
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 429:
            raise RateLimitError("Brave rate limit hit")
        response.raise_for_status()
    except RateLimitError as e:
        error_logger.error(str(e))
        return []
    except httpx.HTTPStatusError as e:
        # raise_for_status() raises HTTPStatusError, which is a sibling of
        # RequestError, so it needs its own handler.
        error_logger.error(f"Brave search returned HTTP {e.response.status_code}")
        return []
    except httpx.RequestError as e:
        error_logger.error(f"NetworkError during Brave search: {e}")
        return []
    try:
        data = response.json()
    except ValueError as e:  # includes json.JSONDecodeError
        logger.error(f"Failed to parse Brave API response: {e}")
        return []
    web = data.get("web") if isinstance(data, dict) else None
    items = (web.get("results") or []) if isinstance(web, dict) else []
    results = []
    for item in items[:count]:
        url = item.get("url", "")
        if not url:
            continue
        results.append({
            "title": item.get("title", ""),
            "url": url,
            "snippet": item.get("description", "") or item.get("content", "") or "",
            "age": item.get("date") or "",
        })
    logger.info(f"Brave search returned {len(results)} results")
    return results
# ── DuckDuckGo (free, no key) ──
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Search DuckDuckGo without an API key.

    Prefers the ``duckduckgo_search`` package (``DDGS().text``). The HTML
    endpoint at html.duckduckgo.com is scraped instead when the package is
    not installed, when it raises, or when it returns no usable results.
    ``time_filter`` is only applied on the package path.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts; empty on failure.
    """
    def _scrape_html() -> List[dict]:
        # Scraper used whenever the library path is unavailable or empty.
        try:
            page = httpx.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query},
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=REQUEST_TIMEOUT,
            )
            page.raise_for_status()
            scraped = []
            for node in BeautifulSoup(page.text, "html.parser").select(".result")[:count]:
                anchor = node.select_one(".result__a")
                href = anchor.get("href", "") if anchor else ""
                if not href:
                    continue
                snippet_node = node.select_one(".result__snippet")
                scraped.append({
                    "title": anchor.get_text(" ", strip=True),
                    "url": href,
                    "snippet": snippet_node.get_text(" ", strip=True) if snippet_node else "",
                })
            logger.info(f"DuckDuckGo HTML search returned {len(scraped)} results")
            return scraped
        except Exception as e:
            logger.warning(f"DuckDuckGo HTML search failed: {e}")
            return []

    try:
        from duckduckgo_search import DDGS
    except ImportError:
        logger.warning("duckduckgo-search package not installed; using HTML fallback")
        return _scrape_html()

    # Translate the generic filter name into the library's one-letter code.
    timelimit = {"day": "d", "week": "w", "month": "m", "year": "y"}.get(time_filter) if time_filter else None
    try:
        hits = [
            {
                "title": entry.get("title", ""),
                "url": entry.get("href", ""),
                "snippet": entry.get("body", ""),
            }
            for entry in DDGS().text(query, max_results=count, timelimit=timelimit)
            if entry.get("href", "")
        ]
        logger.info(f"DuckDuckGo search returned {len(hits)} results")
        return hits or _scrape_html()
    except Exception as e:
        logger.warning(f"DuckDuckGo search failed: {e}")
        return _scrape_html()
# ── Google Programmable Search Engine ──
def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Search using Google PSE (Custom Search JSON API).

    Requires two values:
    - API key: the ``google_pse_key`` setting (legacy ``search_api_key`` is
      used when that is blank), or the GOOGLE_API_KEY env var
    - Programmable Search Engine ID (cx): the ``google_pse_cx`` setting, or
      the GOOGLE_PSE_CX env var

    Args:
        query: Search terms.
        count: Maximum number of results; the request asks for 1-10 because
            the API serves at most 10 per request.
        time_filter: Optional "day" / "week" / "month" / "year" filter, sent
            as ``dateRestrict``.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts. Missing credentials,
        rate limiting, transport errors, HTTP error statuses and unparsable
        bodies all yield an empty list.
    """
    settings = _get_search_settings()
    api_key = _get_provider_key("google_pse") or os.environ.get("GOOGLE_API_KEY", "")
    cx = (settings.get("google_pse_cx") or "").strip() or os.environ.get("GOOGLE_PSE_CX", "")
    if not api_key or not cx:
        logger.warning("Google PSE: missing API key or CX ID")
        return []
    params = {
        "key": api_key,
        "cx": cx,
        "q": query,
        "num": max(1, min(count, 10)),  # Google PSE accepts 1..10 per request
    }
    # dateRestrict: d[number], w[number], m[number], y[number]
    date_restrict = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"}
    if time_filter in date_restrict:
        params["dateRestrict"] = date_restrict[time_filter]
    try:
        response = httpx.get(
            "https://www.googleapis.com/customsearch/v1",
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 429:
            raise RateLimitError("Google PSE rate limit hit")
        response.raise_for_status()
        data = response.json()
    except RateLimitError as e:
        error_logger.error(str(e))
        return []
    except (httpx.HTTPError, ValueError) as e:
        # httpx.HTTPError covers both transport failures (RequestError) and
        # the HTTPStatusError raised by raise_for_status(); ValueError covers
        # a body that is not valid JSON.
        error_logger.error(f"Google PSE search failed: {e}")
        return []
    items = (data.get("items") or []) if isinstance(data, dict) else []
    results = []
    for item in items[:count]:
        url = item.get("link", "")
        if not url:
            continue
        results.append({
            "title": item.get("title", ""),
            "url": url,
            "snippet": item.get("snippet", ""),
        })
    logger.info(f"Google PSE returned {len(results)} results")
    return results
# ── Tavily ──
def tavily_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Search using Tavily API.

    The key is read from the ``tavily_api_key`` setting (legacy
    ``search_api_key`` when that is blank) or the TAVILY_API_KEY env var.

    Args:
        query: Search terms.
        count: Maximum number of results.
        time_filter: Optional "day" / "week" / "month" / "year" filter,
            converted to a number of days.

    Returns:
        A list of ``{"title", "url", "snippet", "age"}`` dicts. Missing key,
        rate limiting, transport errors, HTTP error statuses and unparsable
        bodies all yield an empty list.
    """
    api_key = _get_provider_key("tavily") or os.environ.get("TAVILY_API_KEY", "")
    if not api_key:
        logger.warning("Tavily: no API key configured")
        return []
    payload = {
        "query": query,
        "max_results": count,
        "include_answer": False,
    }
    # NOTE(review): Tavily appears to honour `days` only for the news topic;
    # confirm whether `time_range` should be sent instead.
    days_back = {"day": 1, "week": 7, "month": 30, "year": 365}
    if time_filter in days_back:
        payload["days"] = days_back[time_filter]
    try:
        response = httpx.post(
            "https://api.tavily.com/search",
            json=payload,
            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 429:
            raise RateLimitError("Tavily rate limit hit")
        response.raise_for_status()
        data = response.json()
    except RateLimitError as e:
        error_logger.error(str(e))
        return []
    except (httpx.HTTPError, ValueError) as e:
        # httpx.HTTPError covers both transport failures (RequestError) and
        # the HTTPStatusError raised by raise_for_status(); ValueError covers
        # a body that is not valid JSON.
        error_logger.error(f"Tavily search failed: {e}")
        return []
    items = (data.get("results") or []) if isinstance(data, dict) else []
    results = []
    for item in items[:count]:
        url = item.get("url", "")
        if not url:
            continue
        results.append({
            "title": item.get("title", ""),
            "url": url,
            "snippet": item.get("content", ""),
            "age": item.get("published_date", ""),
        })
    logger.info(f"Tavily returned {len(results)} results")
    return results
# ── Serper.dev ──
def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Search using Serper.dev API.

    The key is read from the ``serper_api_key`` setting (legacy
    ``search_api_key`` when that is blank) or the SERPER_API_KEY env var.

    Args:
        query: Search terms.
        count: Maximum number of results.
        time_filter: Optional "day" / "week" / "month" / "year" filter, sent
            as a ``tbs`` value (``qdr:d`` ... ``qdr:y``).

    Returns:
        A list of ``{"title", "url", "snippet", "age"}`` dicts. Missing key,
        rate limiting, transport errors, HTTP error statuses and unparsable
        bodies all yield an empty list.
    """
    api_key = _get_provider_key("serper") or os.environ.get("SERPER_API_KEY", "")
    if not api_key:
        logger.warning("Serper: no API key configured")
        return []
    payload = {
        "q": query,
        "num": count,
    }
    tbs_by_filter = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"}
    if time_filter in tbs_by_filter:
        payload["tbs"] = tbs_by_filter[time_filter]
    try:
        response = httpx.post(
            "https://google.serper.dev/search",
            json=payload,
            headers={"X-API-KEY": api_key, "Content-Type": "application/json"},
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 429:
            raise RateLimitError("Serper rate limit hit")
        response.raise_for_status()
        data = response.json()
    except RateLimitError as e:
        error_logger.error(str(e))
        return []
    except (httpx.HTTPError, ValueError) as e:
        # httpx.HTTPError covers both transport failures (RequestError) and
        # the HTTPStatusError raised by raise_for_status(); ValueError covers
        # a body that is not valid JSON.
        error_logger.error(f"Serper search failed: {e}")
        return []
    items = (data.get("organic") or []) if isinstance(data, dict) else []
    results = []
    for item in items[:count]:
        url = item.get("link", "")
        if not url:
            continue
        results.append({
            "title": item.get("title", ""),
            "url": url,
            "snippet": item.get("snippet", ""),
            "age": item.get("date", ""),
        })
    logger.info(f"Serper returned {len(results)} results")
    return results
+128
View File
@@ -0,0 +1,128 @@
"""Query enhancement, entity extraction, and cache duration helpers."""
import re
import logging
from datetime import timedelta
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------
# Query processing helpers
# ----------------------------------------------------------------------
def _detect_question_type(query: str) -> Optional[str]:
"""Return the leading question word if present (who, what, when, where, why, how)."""
q = query.strip().lower()
for word in ("who", "what", "when", "where", "why", "how"):
if q.startswith(word):
return word
return None
def _extract_entities(query: str) -> Dict[str, List[str]]:
"""Lightweight entity extraction: capitalized words and date patterns."""
entities: Dict[str, List[str]] = {"names": [], "dates": []}
qtype = _detect_question_type(query)
cleaned = query
if qtype:
cleaned = re.sub(rf"^{qtype}\b", "", cleaned, flags=re.I).strip()
for token in re.findall(r"\b[A-Z][a-zA-Z]+\b", cleaned):
entities["names"].append(token)
for year in re.findall(r"\b(19|20)\d{2}\b", cleaned):
entities["dates"].append(year)
month_day_year = re.findall(
r"\b(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{1,2},?\s*\d{4}\b",
cleaned,
flags=re.I,
)
entities["dates"].extend(month_day_year)
return entities
def _split_multi_part(query: str) -> List[str]:
"""Split a query into sub-queries on common conjunctions."""
parts = re.split(r"\s+and\s+|\s+or\s+|;", query, flags=re.I)
return [p.strip() for p in parts if p.strip()]
def _extract_site_filter(query: str) -> Tuple[str, Optional[str]]:
"""Detect a 'site:example.com' token. Returns (query_without_token, site_or_None)."""
match = re.search(r"\bsite:([^\s]+)", query, flags=re.I)
if match:
site = match.group(1)
new_query = re.sub(r"\bsite:[^\s]+", "", query, flags=re.I).strip()
return new_query, site
return query, None
def _boost_entities_in_query(base_query: str, entities: Dict[str, List[str]]) -> str:
"""Append extracted entities to the query using OR to increase relevance."""
parts = [base_query]
if entities.get("names"):
parts.append(" OR ".join(f'"{n}"' for n in entities["names"]))
if entities.get("dates"):
parts.append(" OR ".join(f'"{d}"' for d in entities["dates"]))
return " ".join(parts)
def enhance_query(original_query: str) -> Tuple[str, Optional[str]]:
    """Rewrite a raw query into a boosted boolean expression.

    Steps: remove any ``site:`` token, split into sub-queries, append quoted
    entities to each, add a question-type keyword where one applies, wrap
    every sub-query in parentheses joined by " AND ", and finally re-append
    the ``site:`` filter.

    Returns:
        ``(enhanced_query, site)`` where *site* is None if no filter was given.
    """
    # Question word -> extra keyword OR-ed onto the sub-query ("what" has none).
    keyword_for = {
        "who": "person",
        "when": "date",
        "where": "location",
        "why": "reason",
        "how": "method",
    }
    stripped_query, site = _extract_site_filter(original_query)
    clauses: List[str] = []
    for part in _split_multi_part(stripped_query):
        keyword = keyword_for.get(_detect_question_type(part))
        clause = _boost_entities_in_query(part, _extract_entities(part))
        if keyword is not None:
            clause = f'({clause}) OR ({keyword})'
        clauses.append(clause)
    final_query = " AND ".join(f"({clause})" for clause in clauses)
    if site:
        final_query = f"{final_query} site:{site}"
    return final_query, site
def build_enhanced_query(query: str, time_filter: str = None) -> str:
    """Return the enhanced form of *query*, optionally tagged with a time filter.

    The query is passed through ``enhance_query``. For a recognised
    *time_filter* ("day", "week", "month", "year") an ``after:<letter>``
    token is appended. Both steps are logged at INFO level.
    """
    enhanced_query = enhance_query(query)[0]
    # NOTE(review): `after:` normally expects a date; confirm that engines
    # receiving this query understand a one-letter value such as "after:d".
    letter = {"day": "d", "week": "w", "month": "m", "year": "y"}.get(time_filter) if time_filter else None
    if letter is not None:
        enhanced_query = f"{enhanced_query} after:{letter}"
        logger.info(f"Added time filter '{time_filter}' to query")
    logger.info(f"Enhanced query: '{query}' -> '{enhanced_query}'")
    return enhanced_query
# ----------------------------------------------------------------------
# Cache duration helpers
# ----------------------------------------------------------------------
def _is_news_query(query: str) -> bool:
"""Lightweight heuristic to decide if a query is news-oriented."""
news_terms = {"news", "latest", "breaking", "today", "today's", "current", "updates", "happening"}
tokens = set(re.findall(r"\b\w+\b", query.lower()))
return bool(tokens & news_terms)
def _cache_duration_for_query(query: str) -> timedelta:
    """Pick a cache lifetime: 30 minutes for news queries, 24 hours otherwise."""
    return timedelta(minutes=30) if _is_news_query(query) else timedelta(hours=24)
+127
View File
@@ -0,0 +1,127 @@
"""Search result ranking based on relevance, source quality, and recency."""
import re
import logging
from datetime import datetime
from typing import List, Optional
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
_SPORTS_HINTS = {
"sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
"fifa", "world cup", "championship", "quarterfinal", "eliminates",
}
_LOW_VALUE_NEWS_DOMAINS = {
"facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
"www.yahoo.com", "msn.com", "www.msn.com",
}
_TRUSTED_NEWS_DOMAINS = {
"apnews.com", "www.apnews.com", "reuters.com", "www.reuters.com",
"bbc.com", "www.bbc.com", "cbc.ca", "www.cbc.ca",
"ctvnews.ca", "www.ctvnews.ca", "globalnews.ca", "www.globalnews.ca",
"theguardian.com",
"www.theguardian.com", "euronews.com", "www.euronews.com",
"dw.com", "www.dw.com", "government.se", "www.government.se",
}
def _domain(url: str) -> str:
try:
return urlparse(url).netloc.lower()
except Exception:
return ""
def rank_search_results(query: str, results: List[dict]) -> List[dict]:
"""Rank search results by title relevance, snippet quality, domain authority, and recency."""
query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
query_lc = query.lower()
is_news_query = any(term in _NEWS_HINTS for term in query_terms)
is_sports_query = any(hint in query_lc for hint in _SPORTS_HINTS)
def title_score(title: str) -> float:
if not title:
return 0.0
title_lc = title.lower()
matches = sum(1 for term in query_terms if re.search(rf"\b{re.escape(term)}\b", title_lc))
return matches / len(query_terms) if query_terms else 0.0
def snippet_score(snippet: str) -> float:
if not snippet:
return 0.0
length_factor = min(len(snippet), 200) / 200
term_hits = sum(1 for term in query_terms if term in snippet.lower())
term_factor = term_hits / len(query_terms) if query_terms else 0.0
return (length_factor + term_factor) / 2
def domain_score(url: str) -> float:
netloc = _domain(url)
if not netloc:
return 0.0
if netloc in _TRUSTED_NEWS_DOMAINS:
return 1.0
if netloc.endswith(".edu") or netloc.endswith(".gov"):
return 1.0
if netloc.endswith(".org"):
return 0.7
return 0.4
def recency_score(age_str: Optional[str]) -> float:
if not age_str:
return 0.0
for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
try:
dt = datetime.strptime(age_str, fmt)
break
except Exception:
dt = None
if not dt:
return 0.0
days_old = (datetime.now() - dt).days
if days_old <= 7:
return 1.0
if days_old >= 30:
return 0.0
return (30 - days_old) / 23
def news_quality_adjustment(title: str, snippet: str, url: str) -> float:
if not is_news_query:
return 0.0
text = f"{title} {snippet}".lower()
netloc = _domain(url)
adjustment = 0.0
if netloc in _TRUSTED_NEWS_DOMAINS:
adjustment += 1.2
if any(term in text for term in ("latest news", "breaking news", "daily coverage", "news from")):
adjustment += 0.4
if netloc in _LOW_VALUE_NEWS_DOMAINS:
adjustment -= 0.8
if not is_sports_query and any(hint in text or hint in netloc for hint in _SPORTS_HINTS):
adjustment -= 1.5
# A country/news query should not rank a page whose title/snippet barely
# mentions the country above actual news pages for that country.
subject_terms = [t for t in query_terms if t not in _NEWS_HINTS]
if subject_terms and not any(t in text or t in netloc for t in subject_terms):
adjustment -= 1.0
return adjustment
ranked = []
for result in results:
title = result.get("title", "")
snippet = result.get("snippet", "")
url = result.get("url", "")
age = result.get("age", None)
score = (
2.0 * title_score(title)
+ 1.0 * snippet_score(snippet)
+ 1.5 * domain_score(url)
+ 1.0 * recency_score(age)
+ news_quality_adjustment(title, snippet, url)
)
ranked.append((score, result))
ranked.sort(key=lambda x: x[0], reverse=True)
return [r for _, r in ranked]
+95
View File
@@ -0,0 +1,95 @@
# services/search/service.py
"""Search service — clean interface for web search."""
from dataclasses import dataclass
from typing import List, Optional, Dict, Any
from . import (
comprehensive_web_search,
fetch_webpage_content,
get_search_config,
)
@dataclass
class SearchResult:
    """A single search result."""
    # Address of the result.
    url: str
    # Title text of the result.
    title: str
    # Short excerpt accompanying the result.
    snippet: str
    # Page content for the result, when one was supplied; None otherwise.
    content: Optional[str] = None
@dataclass
class SearchResponse:
    """Response from a search query."""
    # The query string the response answers.
    query: str
    # Results in the order they were produced.
    results: List[SearchResult]
    # Number of entries in `results` as set by the producer.
    total: int
    # Cache flag; defaults to False.
    cached: bool = False
class SearchService:
    """
    Web search service.

    Usage:
        service = SearchService()
        result = await service.search("python async patterns")
        for r in result.results:
            print(f"{r.title}: {r.url}")
    """

    def __init__(self, default_depth: int = 1, fetch_content: bool = True):
        """Store the defaults used when ``search`` is called without overrides.

        Args:
            default_depth: Depth used when ``search`` gets no ``depth``.
            fetch_content: Whether ``search`` fetches page content by default.
        """
        self.default_depth = default_depth
        # Kept under its own name: assigning it to ``self.fetch_content``
        # would shadow the fetch_content() method with a bool and make that
        # method uncallable on instances.
        self.default_fetch_content = fetch_content

    async def search(
        self,
        query: str,
        depth: Optional[int] = None,
        fetch_content: Optional[bool] = None,
    ) -> "SearchResponse":
        """
        Search the web.

        Args:
            query: Search query
            depth: Search depth (1=quick, 2=thorough, 3=comprehensive);
                a falsy value selects the instance default
            fetch_content: Whether to fetch full page content; None selects
                the instance default

        Returns:
            SearchResponse with results
        """
        depth = depth or self.default_depth
        if fetch_content is None:
            fetch_content = self.default_fetch_content
        # Use existing search implementation
        # NOTE(review): this loop expects an iterable of dicts; confirm that
        # comprehensive_web_search is awaitable and returns that shape.
        raw_results = await comprehensive_web_search(
            query,
            max_results=10 * depth,
            fetch_content=fetch_content,
        )
        results = [
            SearchResult(
                url=r.get("url", ""),
                title=r.get("title", ""),
                snippet=r.get("snippet", ""),
                content=r.get("content"),
            )
            for r in raw_results
        ]
        return SearchResponse(
            query=query,
            results=results,
            total=len(results),
        )

    async def fetch_content(self, url: str) -> Optional[str]:
        """Fetch content from a URL."""
        return await fetch_webpage_content(url)

    def get_config(self) -> Dict[str, Any]:
        """Get current search configuration."""
        return get_search_config()