Odysseus v1.0

2026-06-16 17:55:26 -04:00 · 2026-05-31 23:58:26 +09:00
commit e5c99a5eee
421 changed files with 271349 additions and 0 deletions
@@ -0,0 +1,35 @@
+"""Search service — web search with SearXNG."""
+
+from .core import (
+    comprehensive_web_search,
+    get_search_config,
+    invalidate_search_cache,
+    searxng_search_results,
+    update_search_config,
+)
+from .content import fetch_webpage_content
+from .providers import searxng_search, searxng_search_api, PROVIDER_INFO
+from .analytics import get_search_stats, SearchEngineError, NetworkError, ParseError, RateLimitError
+from .service import SearchService, SearchResult, SearchResponse
+
+# Public API of the search package: the names exported by a star-import.
+__all__ = [
+    # Service interface (preferred)
+    "SearchService",
+    "SearchResult",
+    "SearchResponse",
+    # Low-level functions (kept for backwards compatibility)
+    "comprehensive_web_search",
+    "fetch_webpage_content",
+    "get_search_config",
+    "get_search_stats",
+    "invalidate_search_cache",
+    "searxng_search",
+    "searxng_search_api",
+    "searxng_search_results",
+    "update_search_config",
+    "PROVIDER_INFO",
+    # Exception types (imported from .analytics)
+    "SearchEngineError",
+    "NetworkError",
+    "ParseError",
+    "RateLimitError",
+]
@@ -0,0 +1,136 @@
+"""Search analytics, metrics tracking, and exception hierarchy."""
+
+import json
+import logging
+from collections import Counter
+from pathlib import Path
+from typing import Dict, Any
+
+from .cache import cache_metrics
+
+logger = logging.getLogger(__name__)
+
+# Dedicated error logger with its own file handler. The log file lives one
+# directory above the directory that contains this module.
+# NOTE: the handler is created and attached at import time, so importing this
+# module has the side effect of opening the log file.
+_error_log_path = Path(__file__).resolve().parent.parent / "search_engine_error.log"
+_error_handler = logging.FileHandler(_error_log_path, encoding="utf-8")
+_error_handler.setLevel(logging.WARNING)  # this handler ignores records below WARNING
+_error_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))
+error_logger = logging.getLogger("search_engine_error")
+error_logger.addHandler(_error_handler)
+error_logger.propagate = False  # do not forward these records to ancestor loggers
+
+# Analytics file: JSON document stored in the same directory as the error log.
+ANALYTICS_FILE = Path(__file__).resolve().parent.parent / "search_analytics.json"
+
+
+# ----------------------------------------------------------------------
+# Custom exception hierarchy
+# ----------------------------------------------------------------------
+class SearchEngineError(Exception):
+    """Base class for all search-engine related errors.
+
+    Catching this type also catches NetworkError, ParseError and
+    RateLimitError, which all derive from it.
+    """
+
+
+class NetworkError(SearchEngineError):
+    """Raised when a network request fails (e.g., timeout, DNS error)."""
+
+
+class ParseError(SearchEngineError):
+    """Raised when HTML or other content cannot be parsed."""
+
+
+class RateLimitError(SearchEngineError):
+    """Raised when the remote service returns a rate-limit (HTTP 429)."""
+
+
+# ----------------------------------------------------------------------
+# Analytics helpers
+# ----------------------------------------------------------------------
+def _load_analytics() -> Dict[str, Any]:
+    """Load analytics data from the JSON file, creating defaults if missing.
+
+    Returns:
+        A dict that always contains the counters ``total_queries``,
+        ``successful_queries``, ``failed_queries``, ``cache_hits``,
+        ``cache_misses`` and a ``query_patterns`` dict. If the file is
+        missing it is created with default values. If it is unreadable or
+        does not hold a JSON object, defaults are returned and the file is
+        left untouched.
+    """
+    defaults: Dict[str, Any] = {
+        "total_queries": 0,
+        "successful_queries": 0,
+        "failed_queries": 0,
+        "cache_hits": 0,
+        "cache_misses": 0,
+        "query_patterns": {},
+    }
+    if not ANALYTICS_FILE.exists():
+        _save_analytics(defaults)
+        return defaults
+    try:
+        with open(ANALYTICS_FILE, "r", encoding="utf-8") as f:
+            loaded = json.load(f)
+    except Exception as e:
+        logger.warning(f"Failed to load analytics file: {e}")
+        return defaults
+    if not isinstance(loaded, dict):
+        # Valid JSON but not an object (e.g. a list or null): callers index
+        # the result by key, so fall back to a usable structure.
+        logger.warning("Failed to load analytics file: top-level JSON value is not an object")
+        return defaults
+    # Back-fill any counter missing from an older or hand-edited file so that
+    # callers can rely on every key being present.
+    for key, value in defaults.items():
+        loaded.setdefault(key, value)
+    if not isinstance(loaded["query_patterns"], dict):
+        loaded["query_patterns"] = {}
+    return loaded
+
+
+def _save_analytics(data: Dict[str, Any]) -> None:
+    """Write *data* to the analytics JSON file (best effort).
+
+    Any failure is logged as a warning and otherwise ignored, so analytics
+    problems never propagate to the caller.
+    """
+    try:
+        handle = open(ANALYTICS_FILE, "w", encoding="utf-8")
+        with handle:
+            json.dump(data, handle, indent=2)
+    except Exception as exc:
+        logger.warning(f"Failed to write analytics file: {exc}")
+
+
+def _record_query(query: str, success: bool, cache_hit: bool) -> None:
+    """Update analytics for a single query execution.
+
+    Increments the persisted totals, the in-process ``cache_metrics``
+    hit/miss counter, and the per-query entry under ``query_patterns``,
+    then writes the analytics back to disk.
+
+    Args:
+        query: The search query text, used as the key in ``query_patterns``.
+        success: Whether the query produced results.
+        cache_hit: Whether the results were served from the cache.
+    """
+    analytics = _load_analytics()
+
+    # The data comes from a file on disk, so do not assume every counter is
+    # present: a missing key is treated as zero instead of raising KeyError.
+    outcome_counter = "successful_queries" if success else "failed_queries"
+    cache_counter = "cache_hits" if cache_hit else "cache_misses"
+    for counter in ("total_queries", outcome_counter, cache_counter):
+        analytics[counter] = analytics.get(counter, 0) + 1
+    cache_metrics["hits" if cache_hit else "misses"] += 1
+
+    patterns = analytics.get("query_patterns")
+    if not isinstance(patterns, dict):
+        patterns = {}
+        analytics["query_patterns"] = patterns
+    entry = patterns.get(query)
+    if not isinstance(entry, dict):
+        entry = {}
+    entry["count"] = entry.get("count", 0) + 1
+    entry["successes"] = entry.get("successes", 0) + (1 if success else 0)
+    patterns[query] = entry
+
+    _save_analytics(analytics)
+
+
+def get_search_stats() -> Dict[str, Any]:
+    """Summarise the persisted analytics together with in-process cache metrics.
+
+    Returns:
+        A dict with the raw counters, the derived ``success_rate`` and
+        ``cache_hit_rate`` (each computed against a denominator of at least
+        1), the five most frequent queries, and the runtime hit/miss/eviction
+        counters from ``cache_metrics``.
+    """
+    stats = _load_analytics()
+    total_queries = stats.get("total_queries", 0)
+    successful = stats.get("successful_queries", 0)
+    failed = stats.get("failed_queries", 0)
+    hits = stats.get("cache_hits", 0)
+    misses = stats.get("cache_misses", 0)
+
+    # Fall back to 1 so an empty history yields a rate of 0 rather than a
+    # division by zero.
+    query_denominator = total_queries or 1
+    cache_denominator = (hits + misses) or 1
+
+    frequencies = Counter()
+    for text, info in stats.get("query_patterns", {}).items():
+        frequencies[text] = info["count"]
+    top_queries = [text for text, _count in frequencies.most_common(5)]
+
+    summary: Dict[str, Any] = {}
+    summary["most_common_queries"] = top_queries
+    summary["success_rate"] = successful / query_denominator
+    summary["cache_hit_rate"] = hits / cache_denominator
+    summary["total_queries"] = total_queries
+    summary["successful_queries"] = successful
+    summary["failed_queries"] = failed
+    summary["cache_hits"] = hits
+    summary["cache_misses"] = misses
+    summary["cache_evictions"] = cache_metrics["evictions"]
+    summary["runtime_cache_hits"] = cache_metrics["hits"]
+    summary["runtime_cache_misses"] = cache_metrics["misses"]
+    return summary
@@ -0,0 +1,57 @@
+"""Search and content caching with LRU eviction."""
+
+import hashlib
+import logging
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Dict
+
+logger = logging.getLogger(__name__)
+
+# Cache directories: a "cache" folder one level above this module's directory,
+# with one sub-folder per cache kind.
+CACHE_DIR = Path(__file__).resolve().parent.parent / "cache"
+SEARCH_CACHE_DIR = CACHE_DIR / "search"
+CONTENT_CACHE_DIR = CACHE_DIR / "content"
+# Upper bound on the number of index entries kept per cache (see cleanup_cache).
+CACHE_MAX_ENTRIES = 1000
+
+# Create cache directories (runs at import time; a no-op if they already exist)
+SEARCH_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+CONTENT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+# In-memory indexes mapping cache key -> timestamp, used for eviction.
+# They start empty in every process; they are not loaded from disk here.
+search_cache_index: Dict[str, datetime] = {}
+content_cache_index: Dict[str, datetime] = {}
+
+# Cache metrics (shared across modules); process-local counters.
+cache_metrics = {"hits": 0, "misses": 0, "evictions": 0}
+
+
+def generate_cache_key(data: str) -> str:
+    """Return the hex SHA-256 digest of *data* (UTF-8 encoded) for use as a cache key."""
+    digest = hashlib.sha256()
+    digest.update(data.encode("utf-8"))
+    return digest.hexdigest()
+
+
+def cleanup_cache(cache_dir: Path, cache_index: Dict[str, datetime], max_age: timedelta):
+    """Remove expired cache entries and enforce the entry-count limit.
+
+    Three passes are made over ``cache_dir``:
+
+    1. Indexed entries older than ``max_age``, or whose file has vanished,
+       are dropped from ``cache_index`` and their file is deleted.
+    2. ``*.cache`` files that are not in ``cache_index`` are handled by file
+       modification time. The index is in-memory only, so after a restart
+       every file is in this state. Files older than ``max_age`` are deleted;
+       newer ones are adopted into the index so the size limit covers them.
+    3. If the index still exceeds ``CACHE_MAX_ENTRIES``, the entries with the
+       oldest timestamps are evicted until it fits.
+
+    Every removal increments ``cache_metrics["evictions"]``.
+
+    Args:
+        cache_dir: Directory holding ``<key>.cache`` files.
+        cache_index: Mapping of cache key to timestamp; mutated in place.
+        max_age: Entries older than this are considered expired.
+    """
+    current_time = datetime.now()
+    files_in_dir = {f.name.split(".")[0]: f for f in cache_dir.glob("*.cache")}
+
+    # Pass 1: indexed entries that expired or lost their backing file.
+    # list(...) snapshots the index so concurrent writers cannot break iteration.
+    for key, timestamp in list(cache_index.items()):
+        if current_time - timestamp > max_age or key not in files_in_dir:
+            cache_index.pop(key, None)
+            stale_file = files_in_dir.pop(key, None)
+            if stale_file is not None:
+                stale_file.unlink(missing_ok=True)
+            cache_metrics["evictions"] += 1
+
+    # Pass 2: files on disk that the index does not know about.
+    for key, path in files_in_dir.items():
+        if key in cache_index:
+            continue
+        try:
+            modified = datetime.fromtimestamp(path.stat().st_mtime)
+        except OSError:
+            # File disappeared (or is unreadable) between glob and stat.
+            continue
+        if current_time - modified > max_age:
+            path.unlink(missing_ok=True)
+            cache_metrics["evictions"] += 1
+        else:
+            cache_index[key] = modified
+
+    # Pass 3: enforce the size limit, oldest timestamps first.
+    if len(cache_index) > CACHE_MAX_ENTRIES:
+        sorted_items = sorted(cache_index.items(), key=lambda x: x[1])
+        excess_count = len(cache_index) - CACHE_MAX_ENTRIES
+        for key, _ in sorted_items[:excess_count]:
+            cache_index.pop(key, None)
+            cache_file = cache_dir / f"{key}.cache"
+            cache_file.unlink(missing_ok=True)
+            cache_metrics["evictions"] += 1
@@ -0,0 +1,360 @@
+"""Webpage content fetching with caching, PDF extraction, and summarization helpers."""
+
+import io
+import ipaddress
+import json
+import os
+import re
+import logging
+import socket
+from datetime import datetime, timedelta
+from typing import List
+from urllib.parse import urljoin, urlparse
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .analytics import RateLimitError, error_logger
+from .cache import (
+    CONTENT_CACHE_DIR,
+    content_cache_index,
+    generate_cache_key,
+    cleanup_cache,
+)
+
+logger = logging.getLogger(__name__)
+
+# Networks that must never be fetched; checked by _is_private_address in
+# addition to the ipaddress module's own is_private/is_loopback/is_link_local
+# flags.
+_PRIVATE_NETWORKS = (
+    ipaddress.ip_network("0.0.0.0/8"),       # "this network"
+    ipaddress.ip_network("10.0.0.0/8"),      # RFC 1918 private
+    ipaddress.ip_network("127.0.0.0/8"),     # IPv4 loopback
+    ipaddress.ip_network("169.254.0.0/16"),  # IPv4 link-local
+    ipaddress.ip_network("172.16.0.0/12"),   # RFC 1918 private
+    ipaddress.ip_network("192.168.0.0/16"),  # RFC 1918 private
+    ipaddress.ip_network("::1/128"),         # IPv6 loopback
+    ipaddress.ip_network("fc00::/7"),        # IPv6 unique local
+    ipaddress.ip_network("fe80::/10"),       # IPv6 link-local
+)
+
+
+def _is_private_address(addr: ipaddress._BaseAddress) -> bool:
+    """Return True if *addr* must not be fetched because it is not a public host.
+
+    Besides private, loopback and link-local addresses, this also rejects
+    multicast, IETF-reserved and unspecified addresses, plus the RFC 6598
+    shared address space (100.64.0.0/10). None of these is a legitimate
+    target for fetching a public web page.
+    """
+    if (
+        addr.is_private
+        or addr.is_loopback
+        or addr.is_link_local
+        or addr.is_multicast
+        or addr.is_reserved
+        or addr.is_unspecified
+    ):
+        return True
+    # Carrier-grade NAT / shared address space: not flagged by is_private.
+    # An IPv6 address is simply "not in" an IPv4 network, so this is safe
+    # for both address families.
+    if addr in ipaddress.ip_network("100.64.0.0/10"):
+        return True
+    return any(addr in net for net in _PRIVATE_NETWORKS)
+
+
+def _resolve_hostname_ips(hostname: str) -> list[ipaddress._BaseAddress]:
+    """Resolve *hostname* and return every address found as an ipaddress object.
+
+    Resolution failures produce an empty list. Individual results that
+    cannot be parsed as an IP address are skipped.
+    """
+    try:
+        lookup = socket.getaddrinfo(hostname, None)
+    except Exception:
+        return []
+
+    resolved = []
+    for entry in lookup:
+        try:
+            # entry[4] is the socket address tuple; its first item is the host.
+            parsed = ipaddress.ip_address(entry[4][0])
+        except Exception:
+            continue
+        resolved.append(parsed)
+    return resolved
+
+
+def _public_http_url(url: str) -> bool:
+    """Return True only if *url* is http(s) and points at a public host.
+
+    A URL is rejected when its scheme is not http/https, when it has no
+    host, when the host is a known internal name or uses an internal-looking
+    suffix, when it is an IP literal in a private range, or when the host
+    does not resolve or resolves to any private address. Any unexpected
+    error also yields False.
+    """
+    blocked_names = ("localhost", "metadata", "metadata.google.internal")
+    blocked_suffixes = (".local", ".localhost", ".internal", ".lan", ".intranet")
+    try:
+        parts = urlparse(url)
+        if parts.scheme not in ("http", "https"):
+            return False
+        hostname = (parts.hostname or "").strip()
+        if not hostname:
+            return False
+        normalized = hostname.lower()
+        if normalized in blocked_names or normalized.endswith(blocked_suffixes):
+            return False
+        try:
+            # IP literal: decide directly without a DNS lookup.
+            return not _is_private_address(ipaddress.ip_address(hostname))
+        except ValueError:
+            # Not an IP literal: fall through to name resolution.
+            pass
+        resolved = _resolve_hostname_ips(hostname)
+        if not resolved:
+            return False
+        return all(not _is_private_address(candidate) for candidate in resolved)
+    except Exception:
+        return False
+
+
+def _get_public_url(url: str, headers: dict, timeout: int, max_redirects: int = 5) -> httpx.Response:
+    """GET *url*, following redirects by hand so each hop can be vetted.
+
+    Every URL in the redirect chain is checked with ``_public_http_url``
+    before it is requested.
+
+    Raises:
+        httpx.RequestError: If any hop targets a private/internal URL, or
+            if more than ``max_redirects`` redirects are encountered.
+    """
+    redirect_statuses = (301, 302, 303, 307, 308)
+    target = url
+    remaining_requests = max_redirects + 1
+    while remaining_requests > 0:
+        remaining_requests -= 1
+        if not _public_http_url(target):
+            raise httpx.RequestError("Blocked private/internal URL", request=httpx.Request("GET", target))
+        reply = httpx.get(target, headers=headers, timeout=timeout, follow_redirects=False)
+        next_hop = reply.headers.get("location")
+        # Anything that is not a redirect with a usable Location is final.
+        if reply.status_code not in redirect_statuses or not next_hop:
+            return reply
+        target = urljoin(str(reply.url), next_hop)
+    raise httpx.RequestError("Too many redirects", request=httpx.Request("GET", target))
+
+# PDF extraction (optional dependency)
+try:
+    from pdfminer.high_level import extract_text as pdf_extract_text
+except ImportError:
+    pdf_extract_text = None  # type: ignore
+
+
+# ----------------------------------------------------------------------
+# HTML extraction helpers
+# ----------------------------------------------------------------------
+def _extract_meta(soup: BeautifulSoup) -> dict:
+    """Return the page's meta description and keywords ("" when absent)."""
+
+    def _meta_content(name_fragment: str) -> str:
+        # First <meta> whose name attribute matches the fragment, case-insensitively.
+        tag = soup.find("meta", attrs={"name": re.compile(name_fragment, re.I)})
+        if tag and tag.get("content"):
+            return tag["content"].strip()
+        return ""
+
+    description = _meta_content("description")
+    keywords = _meta_content("keywords")
+    return {"description": description, "keywords": keywords}
+
+
+def _extract_lists(soup: BeautifulSoup) -> List[List[str]]:
+    """Collect the item texts of every non-empty <ul>/<ol>, one inner list per list element."""
+    per_list_items = (
+        [li.get_text(separator=" ", strip=True) for li in container.find_all("li")]
+        for container in soup.find_all(["ul", "ol"])
+    )
+    # Lists with no <li> children are dropped.
+    return [items for items in per_list_items if items]
+
+
+def _extract_tables(soup: BeautifulSoup) -> List[List[List[str]]]:
+    """Extract every <table> as a list of rows, each row being a list of cell texts.
+
+    Rows without <td>/<th> cells and tables without any such row are omitted.
+    """
+
+    def _rows_of(table) -> List[List[str]]:
+        candidate_rows = (
+            [cell.get_text(separator=" ", strip=True) for cell in tr.find_all(["td", "th"])]
+            for tr in table.find_all("tr")
+        )
+        return [row for row in candidate_rows if row]
+
+    extracted = (_rows_of(table) for table in soup.find_all("table"))
+    return [rows for rows in extracted if rows]
+
+
+def _extract_code_blocks(soup: BeautifulSoup) -> List[str]:
+    """Return the non-empty text of every <pre> and <code> element, in document order."""
+    texts = (
+        element.get_text(separator=" ", strip=True)
+        for element in soup.find_all(["pre", "code"])
+    )
+    return [text for text in texts if text]
+
+
+def _detect_js_frameworks(soup: BeautifulSoup) -> bool:
+    """Heuristically guess whether the page relies on a JavaScript framework.
+
+    Returns True when a framework name appears as a substring of any
+    <script> src or inline script text, or when an element carries a
+    ``data-reactroot`` or ``ng-app`` attribute. This is a plain substring
+    check, so false positives are possible.
+    """
+    markers = (
+        "react", "angular", "vue", "svelte", "next", "nuxt",
+        "ember", "backbone", "jquery", "polymer", "mithril",
+    )
+
+    def _mentions_framework(text: str) -> bool:
+        return any(marker in text for marker in markers)
+
+    for script in soup.find_all("script"):
+        if _mentions_framework(script.get("src", "").lower()):
+            return True
+        inline = script.string
+        if inline and _mentions_framework(inline.lower()):
+            return True
+
+    has_root_marker = soup.find(attrs={"data-reactroot": True}) or soup.find(attrs={"ng-app": True})
+    return bool(has_root_marker)
+
+
+def _empty_result(url: str, error: str = "") -> dict:
+    """Return the standard failure payload for *url*.
+
+    The dict has the same keys as a successful fetch result, with empty
+    values, ``success`` set to False and ``error`` set to the given message.
+    """
+    return dict(
+        url=url,
+        title="",
+        content="",
+        lists=[],
+        tables=[],
+        code_blocks=[],
+        meta_description="",
+        meta_keywords="",
+        js_rendered=False,
+        js_message="",
+        success=False,
+        error=error,
+    )
+
+
+# ----------------------------------------------------------------------
+# Main content fetcher
+# ----------------------------------------------------------------------
+def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) -> dict:
+    """Fetch and extract meaningful content from a webpage with caching.
+
+    Results are cached on disk for two hours, keyed by a hash of the URL.
+    PDF responses have their text extracted when pdfminer is available.
+    HTML responses are parsed for title, main text (truncated to 8000
+    characters), lists, tables, code blocks and meta tags.
+
+    Args:
+        url: Page to fetch. Private/internal targets are refused.
+        timeout: Per-request timeout passed to the HTTP client.
+        retry_attempt: Attempt number, used only in log and error messages.
+
+    Returns:
+        A result dict with the keys produced by ``_empty_result``. On any
+        network failure, rate limit or HTTP error status the dict has
+        ``success`` False and a message in ``error``; this function does not
+        raise for those conditions.
+    """
+    cache_key = generate_cache_key(url)
+    cache_file = CONTENT_CACHE_DIR / f"{cache_key}.cache"
+
+    # Check cache
+    if cache_file.exists():
+        try:
+            with open(cache_file, "r", encoding="utf-8") as f:
+                cached_data = json.load(f)
+            timestamp = datetime.fromisoformat(cached_data["timestamp"])
+            if datetime.now() - timestamp < timedelta(hours=2):
+                logger.debug(f"Content cache hit for URL: {url}")
+                return cached_data["data"]
+            else:
+                # Expired: drop the file and its index entry, then refetch.
+                cache_file.unlink(missing_ok=True)
+                content_cache_index.pop(cache_key, None)
+        except Exception as e:
+            # Corrupt or unreadable cache entry: discard it and refetch.
+            logger.warning(f"Failed to read content cache for {url}: {e}")
+            cache_file.unlink(missing_ok=True)
+            content_cache_index.pop(cache_key, None)
+
+    # Fetch
+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+        }
+        response = _get_public_url(url, headers=headers, timeout=timeout)
+
+        if response.status_code == 429:
+            raise RateLimitError(f"Rate limit hit for {url} (attempt {retry_attempt})")
+
+        response.raise_for_status()
+    except httpx.RequestError as e:
+        error_logger.error(f"NetworkError fetching {url} (attempt {retry_attempt}): {e}")
+        return _empty_result(url, f"NetworkError: {e}")
+    except httpx.HTTPStatusError as e:
+        # raise_for_status() signals error statuses with HTTPStatusError,
+        # which is NOT a RequestError. Without this branch a 404/500 would
+        # escape as an exception instead of a failure result.
+        error_logger.error(f"HTTPError fetching {url} (attempt {retry_attempt}): {e}")
+        return _empty_result(url, f"HTTPError: {e}")
+    except RateLimitError as e:
+        error_logger.error(str(e))
+        return _empty_result(url, str(e))
+
+    # PDF handling
+    content_type = response.headers.get("Content-Type", "").lower()
+    if "application/pdf" in content_type or url.lower().endswith(".pdf"):
+        if pdf_extract_text is None:
+            logger.error("pdfminer.six is not installed; cannot extract PDF text.")
+            pdf_text = ""
+        else:
+            try:
+                pdf_bytes = io.BytesIO(response.content)
+                pdf_text = pdf_extract_text(pdf_bytes)
+            except Exception as e:
+                logger.warning(f"PDF extraction failed for {url}: {e}")
+                pdf_text = ""
+        result = {
+            "url": url,
+            "title": os.path.basename(url),
+            "content": pdf_text,
+            "lists": [],
+            "tables": [],
+            "code_blocks": [],
+            "meta_description": "",
+            "meta_keywords": "",
+            "js_rendered": False,
+            "js_message": "",
+            "success": bool(pdf_text),
+            "error": "" if pdf_text else "Failed to extract PDF text",
+        }
+        _cache_result(cache_file, cache_key, result, url)
+        return result
+
+    # HTML handling
+    try:
+        soup = BeautifulSoup(response.text, "html.parser")
+    except Exception as e:
+        error_logger.error(f"ParseError parsing HTML from {url} (attempt {retry_attempt}): {e}")
+        result = _empty_result(url, f"ParseError: {e}")
+        _cache_result(cache_file, cache_key, result, url)
+        return result
+
+    title_tag = soup.find("title")
+    title_text = title_tag.get_text(strip=True) if title_tag else ""
+    meta_info = _extract_meta(soup)
+    js_rendered = _detect_js_frameworks(soup)
+    js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""
+
+    # Main textual content (heuristic): prefer up to three containers whose
+    # class name looks like a content area, else fall back to the whole body.
+    main_content = ""
+    content_areas = soup.find_all(
+        ["main", "article", "section", "div"],
+        class_=re.compile("content|main|body|article|post|entry|text", re.I),
+    )
+    if content_areas:
+        for area in content_areas[:3]:
+            main_content += area.get_text(separator=" ", strip=True) + " "
+    if not main_content:
+        body = soup.find("body")
+        if body:
+            main_content = body.get_text(separator=" ", strip=True)
+
+    # Collapse whitespace runs and cap the amount of text returned.
+    main_content = re.sub(r"\s+", " ", main_content).strip()[:8000]
+
+    result = {
+        "url": url,
+        "title": title_text,
+        "content": main_content,
+        "lists": _extract_lists(soup),
+        "tables": _extract_tables(soup),
+        "code_blocks": _extract_code_blocks(soup),
+        "meta_description": meta_info.get("description", ""),
+        "meta_keywords": meta_info.get("keywords", ""),
+        "js_rendered": js_rendered,
+        "js_message": js_message,
+        "success": True,
+        "error": "",
+    }
+    _cache_result(cache_file, cache_key, result, url)
+    return result
+
+
+def _cache_result(cache_file, cache_key: str, result: dict, url: str):
+    """Persist *result* to the content cache and record it in the index.
+
+    The result is written together with the current timestamp, the key is
+    registered in ``content_cache_index``, and a cleanup pass with a
+    two-hour maximum age is run. Failures are logged and swallowed.
+    """
+    try:
+        envelope = {"timestamp": datetime.now().isoformat(), "data": result}
+        with open(cache_file, "w", encoding="utf-8") as handle:
+            json.dump(envelope, handle)
+        content_cache_index[cache_key] = datetime.now()
+        max_age = timedelta(hours=2)
+        cleanup_cache(CONTENT_CACHE_DIR, content_cache_index, max_age)
+    except Exception as exc:
+        logger.warning(f"Failed to write content cache for {url}: {exc}")
+
+
+# ----------------------------------------------------------------------
+# Content summarization helpers
+# ----------------------------------------------------------------------
+def extract_key_points(text: str) -> List[str]:
+    """Return the text of every bulleted or numbered line in *text*.
+
+    A line counts when, after optional leading whitespace, it starts with
+    ``-``, ``*`` or ``•``, or with digits followed by ``.`` or ``)``, and
+    then whitespace. The marker is removed and the remainder is stripped.
+    """
+    bullet_pat = re.compile(r"^\s*[-*•]\s+(.*)")
+    numbered_pat = re.compile(r"^\s*\d+[\.\)]\s+(.*)")
+    candidates = (
+        bullet_pat.match(line) or numbered_pat.match(line)
+        for line in text.splitlines()
+    )
+    return [found.group(1).strip() for found in candidates if found]
+
+
+def get_tldr(text: str, max_sentences: int = 3) -> str:
+    """Build a short summary from the leading sentences of *text*.
+
+    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
+    The first ``max_sentences`` non-empty pieces are stripped and joined
+    with single spaces.
+    """
+    pieces = [piece for piece in re.split(r"(?<=[.!?])\s+", text) if piece]
+    leading = pieces[:max_sentences]
+    return " ".join(piece.strip() for piece in leading)
+
+
+def extract_quotes(text: str) -> List[str]:
+    """Find quoted passages of 15 or more characters in *text*.
+
+    A passage is delimited by single or double quote characters and may not
+    contain either kind of quote itself. Results are stripped.
+    """
+    quoted = re.findall(r'["\']([^"\']{15,}?)["\']', text)
+    return [passage.strip() for passage in quoted]
+
+
+def extract_statistics(text: str) -> List[str]:
+    """Find numbers, percentages, years and simple measurements in *text*.
+
+    A match is a number (optionally with ``,`` thousands groups and a
+    decimal part) together with an optional unit directly after it: ``%``,
+    ``‰``, ``percent``, ``per cent`` or a single alphabetic word.
+
+    Returns:
+        The matched substrings in order of appearance, e.g.
+        ``["50%", "2024", "1,234.5 units"]``.
+    """
+    pattern = re.compile(
+        # Do not start in the middle of a word or right after a decimal point.
+        r"(?<![\w.])"
+        # Integer part of any length, optional thousands groups and decimals.
+        r"\d+(?:,\d{3})*(?:\.\d+)?"
+        # Optional unit. The word boundary applies only to word-like units:
+        # a boundary after "%" or "‰" never matches before whitespace or
+        # punctuation, which would silently drop the symbol.
+        r"(?:\s*(?:[%‰]|(?:percent|per cent|[a-zA-Z]+)\b))?",
+        re.IGNORECASE,
+    )
+    return [m.group(0).strip() for m in pattern.finditer(text)]
@@ -0,0 +1,433 @@
+"""Core search orchestrators: searxng_search_results, comprehensive_web_search, config, cache invalidation."""
+
+import json
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timedelta
+from typing import Dict, Any, Optional, List, Set
+from urllib.parse import urlparse
+
+from .analytics import (
+    NetworkError,
+    ParseError,
+    RateLimitError,
+    error_logger,
+    _record_query,
+)
+from .cache import (
+    SEARCH_CACHE_DIR,
+    search_cache_index,
+    generate_cache_key,
+    cleanup_cache,
+)
+from .query import _cache_duration_for_query
+from .ranking import rank_search_results
+from .providers import (
+    searxng_search_api,
+    brave_search,
+    duckduckgo_search,
+    google_pse_search,
+    tavily_search,
+    serper_search,
+    _get_search_settings,
+    _get_result_count,
+)
+from .content import (
+    fetch_webpage_content,
+    extract_key_points,
+    get_tldr,
+    extract_quotes,
+    extract_statistics,
+)
+
+logger = logging.getLogger(__name__)
+
+# ========= CONFIG =========
+# Module-level configuration. get_search_config() returns a copy of this dict
+# with live settings added; update_search_config() may add "brave_api_key".
+SEARCH_CONFIG: Dict[str, Any] = {
+    "primary_provider": "searxng",
+}
+
+
+def get_search_config() -> Dict[str, Any]:
+    """Return a snapshot of the search configuration.
+
+    The snapshot is a copy of ``SEARCH_CONFIG`` extended with the active
+    provider, whether a non-blank API key is configured, the configured
+    result count and, for the ``searxng`` provider only, the instance URL
+    under ``search_url``.
+    """
+    settings = _get_search_settings()
+    active = settings.get("search_provider", "searxng")
+    stored_key = (settings.get("search_api_key") or "").strip()
+
+    snapshot = {
+        **SEARCH_CONFIG,
+        "active_provider": active,
+        "has_api_key": bool(stored_key),
+        "result_count": _get_result_count(),
+    }
+    if active == "searxng":
+        from .providers import _get_search_instance
+        snapshot["search_url"] = _get_search_instance()
+    return snapshot
+
+
+def update_search_config(api_key: str = None, **kwargs):
+    """Store a Brave API key in ``SEARCH_CONFIG`` when a non-empty one is given.
+
+    Any additional keyword arguments are accepted but ignored.
+    """
+    if not api_key:
+        return
+    SEARCH_CONFIG["brave_api_key"] = api_key
+
+
+def _call_provider(provider_name: str, query: str, count: int, time_filter: str = None) -> List[dict]:
+    """Dispatch a search to the provider called *provider_name*.
+
+    Returns whatever the provider function returns. An unrecognised
+    provider name yields an empty list.
+    """
+    # SearXNG takes the time filter as a keyword argument; the others take it
+    # positionally.
+    if provider_name == "searxng":
+        return searxng_search_api(query, count, time_filter=time_filter)
+    if provider_name == "brave":
+        return brave_search(query, count, time_filter)
+    if provider_name == "duckduckgo":
+        return duckduckgo_search(query, count, time_filter)
+    if provider_name == "google_pse":
+        return google_pse_search(query, count, time_filter)
+    if provider_name == "tavily":
+        return tavily_search(query, count, time_filter)
+    if provider_name == "serper":
+        return serper_search(query, count, time_filter)
+    return []
+
+
+# Default fallback providers, tried in order after the primary provider when
+# it yields no results. If the self-hosted SearXNG instance is up but all
+# enabled engines return empty, falling back to the no-key provider keeps
+# "search X" working on fresh installs. Users can override or disable this
+# with the `search_fallback_chain` setting.
+_FALLBACK_ORDER = ["duckduckgo"]
+
+
+def _build_provider_chain(primary: str) -> List[str]:
+    """Return the providers to try, *primary* first and fallbacks after it.
+
+    Fallbacks come from the ``search_fallback_chain`` setting (a list or a
+    comma-separated string) or, when that is empty, from
+    ``_FALLBACK_ORDER``. Blank names, duplicates and the literal
+    ``"disabled"`` are left out.
+    """
+    settings = _get_search_settings()
+    configured = settings.get("search_fallback_chain") or []
+    if isinstance(configured, str):
+        configured = [part.strip() for part in configured.split(",") if part.strip()]
+    candidates = configured or _FALLBACK_ORDER
+
+    ordered = [primary]
+    for name in candidates:
+        # `ordered` already holds the primary, so the membership test also
+        # prevents listing the primary twice.
+        if not name or name == "disabled" or name in ordered:
+            continue
+        ordered.append(name)
+    return ordered
+
+
+# ----------------------------------------------------------------------
+# Unified search with caching and retry
+# ----------------------------------------------------------------------
+def searxng_search_results(query: str, count: int = 10, time_filter: str = None) -> list[dict]:
+    """Perform a web search using the configured provider, with caching and retry.
+
+    Args:
+        query: Search text.
+        count: Number of results wanted. The default of 10 is replaced by
+            the configured result count.
+        time_filter: Optional time restriction passed through to the provider.
+
+    Returns:
+        Ranked results, or an empty list when search is disabled or every
+        provider in the chain failed or returned nothing.
+    """
+    settings = _get_search_settings()
+    search_provider = settings.get("search_provider", "searxng")
+    result_count = _get_result_count()
+
+    # Check the admin switch before touching the cache, so that a disabled
+    # search never serves previously cached results. This matches the order
+    # used by comprehensive_web_search.
+    if search_provider == "disabled":
+        logger.info("Search is disabled via admin settings")
+        return []
+
+    # Use configured count if caller used default
+    if count == 10:
+        count = result_count
+
+    cache_key = generate_cache_key(f"{query}|{count}|{time_filter}")
+    cache_file = SEARCH_CACHE_DIR / f"{cache_key}.cache"
+
+    # Check cache
+    if cache_file.exists():
+        try:
+            with open(cache_file, "r", encoding="utf-8") as f:
+                cached_data = json.load(f)
+            expiry_raw = cached_data.get("expiry")
+            expiry = datetime.fromisoformat(expiry_raw) if expiry_raw else None
+            if expiry and datetime.now() < expiry:
+                logger.debug(f"Search cache hit for query: {query}")
+                results = cached_data["data"]
+                _record_query(query, bool(results), cache_hit=True)
+                return results
+            else:
+                # Expired or missing expiry: discard and search again.
+                cache_file.unlink(missing_ok=True)
+                search_cache_index.pop(cache_key, None)
+        except Exception as e:
+            logger.warning(f"Failed to read search cache for {query}: {e}")
+            cache_file.unlink(missing_ok=True)
+            search_cache_index.pop(cache_key, None)
+
+    logger.debug(f"Search cache miss for query: {query}")
+
+    provider_chain = _build_provider_chain(search_provider)
+
+    # Try each provider up to twice; stop at the first non-empty result set.
+    results: List[dict] = []
+    for provider_name in provider_chain:
+        for attempt in range(2):
+            try:
+                logger.info(f"Attempting {provider_name} search (attempt {attempt + 1})")
+                results = _call_provider(provider_name, query, count, time_filter)
+                if results:
+                    logger.info(f"{provider_name} search succeeded with {len(results)} results")
+                    break
+            except (NetworkError, ParseError, RateLimitError) as e:
+                error_logger.error(f"{provider_name} search error (attempt {attempt + 1}): {e}")
+            except Exception as e:
+                error_logger.error(f"Unexpected error during {provider_name} search (attempt {attempt + 1}): {e}")
+        if results:
+            break
+
+    success = bool(results)
+    _record_query(query, success, cache_hit=False)
+
+    if not success:
+        logger.error(f"All search providers failed for query: {query}")
+        return results
+
+    results = rank_search_results(query, results)
+    try:
+        expiry = datetime.now() + _cache_duration_for_query(query)
+        cache_data = {
+            "timestamp": datetime.now().isoformat(),
+            "expiry": expiry.isoformat(),
+            "data": results,
+        }
+        with open(cache_file, "w", encoding="utf-8") as f:
+            json.dump(cache_data, f)
+        search_cache_index[cache_key] = datetime.now()
+        cleanup_cache(SEARCH_CACHE_DIR, search_cache_index, timedelta(hours=1))
+    except Exception as e:
+        # Caching is best effort; the results are still returned.
+        logger.warning(f"Failed to write search cache for {query}: {e}")
+
+    return results
+
+
+# ----------------------------------------------------------------------
+# Cache invalidation
+# ----------------------------------------------------------------------
+def invalidate_search_cache(query: Optional[str] = None) -> None:
+    """Invalidate cached search results.
+
+    Args:
+        query: ``None`` clears the whole search cache. Otherwise only the
+            entry for this query is removed, as cached by a default call
+            (default result count, no time filter). Entries cached with an
+            explicit non-default count or a time filter are not affected.
+    """
+    if query is None:
+        for file in SEARCH_CACHE_DIR.glob("*.cache"):
+            try:
+                file.unlink(missing_ok=True)
+            except Exception as e:
+                error_logger.warning(f"Failed to delete cache file {file}: {e}")
+        search_cache_index.clear()
+        logger.info("All search cache entries have been cleared.")
+        return
+
+    # searxng_search_results replaces the default count of 10 with the
+    # configured result count before hashing, so the key must use that
+    # value. The literal 10 is tried as well, to cover entries written while
+    # the configured count was still 10. dict.fromkeys de-duplicates the two
+    # while keeping their order.
+    candidate_counts = dict.fromkeys((_get_result_count(), 10))
+    removed = False
+    for count in candidate_counts:
+        cache_key = generate_cache_key(f"{query}|{count}|None")
+        cache_file = SEARCH_CACHE_DIR / f"{cache_key}.cache"
+        if not cache_file.exists():
+            continue
+        try:
+            cache_file.unlink(missing_ok=True)
+            search_cache_index.pop(cache_key, None)
+            removed = True
+            logger.info(f"Cache entry for query '{query}' has been invalidated.")
+        except Exception as e:
+            error_logger.warning(f"Failed to delete cache file for query '{query}': {e}")
+            # A failed deletion is still "found"; avoid the misleading
+            # "no entry" message below.
+            removed = True
+    if not removed:
+        logger.info(f"No cache entry found for query '{query}'.")
+
+
+# ----------------------------------------------------------------------
+# Comprehensive web search (with advanced filtering)
+# ----------------------------------------------------------------------
+def comprehensive_web_search(
+    query: str,
+    max_pages: int = 3,
+    max_workers: int = 4,
+    time_filter: str = None,
+    domain_whitelist: Optional[Set[str]] = None,
+    domain_blacklist: Optional[Set[str]] = None,
+    content_type: Optional[str] = None,
+    language: Optional[str] = None,
+    min_content_length: int = 0,
+    return_sources: bool = False,
+):
+    """Perform comprehensive web search with content fetching and advanced filtering."""
+    logger.info(f"Starting comprehensive search for: {query}")
+    if time_filter:
+        logger.info(f"Applying time filter: {time_filter}")
+
+    settings = _get_search_settings()
+    search_provider = settings.get("search_provider", "searxng")
+    result_count = _get_result_count()
+
+    if search_provider == "disabled":
+        logger.info("Search is disabled via admin settings")
+        msg = "Web search is disabled by the administrator."
+        return (msg, []) if return_sources else msg
+
+    # Use configured result count (at least max_pages for content fetching)
+    fetch_count = max(result_count, max_pages)
+
+    provider_chain = _build_provider_chain(search_provider)
+
+    search_results = []
+    provider_attempts = {}
+    for provider_name in provider_chain:
+        last_err = None
+        empty = False
+        for attempt in range(2):
+            try:
+                search_results = _call_provider(provider_name, query, fetch_count, time_filter)
+                if search_results:
+                    provider_attempts[provider_name] = f"ok ({len(search_results)})"
+                    logger.info(f"Comprehensive search: {provider_name} returned {len(search_results)} results")
+                    break
+                empty = True
+            except Exception as e:
+                last_err = e
+                logger.warning(f"Comprehensive search: {provider_name} attempt {attempt + 1} failed: {e}")
+        if search_results:
+            break
+        if last_err is not None:
+            provider_attempts[provider_name] = f"error: {last_err}"
+        elif empty:
+            provider_attempts[provider_name] = "empty"
+
+    if not search_results:
+        tally = ", ".join(f"{p}:{r}" for p, r in provider_attempts.items()) or "no providers configured"
+        any_errors = any(r.startswith("error") for r in provider_attempts.values())
+        if any_errors:
+            msg = f"Web search failed — all providers errored or returned empty. Tried: {tally}"
+        else:
+            msg = (
+                f"No search results found. Tried: {tally}. "
+                "All providers returned empty — possibly a niche query or upstream rate-limiting; "
+                "rephrasing or using the browser tool for a specific URL may help."
+            )
+        logger.warning(msg)
+        return (msg, []) if return_sources else msg
+
+    search_results = rank_search_results(query, search_results)
+
+    # URL filter helper
+    def url_passes_filters(url: str) -> bool:
+        try:
+            netloc = urlparse(url).netloc.lower()
+        except Exception:
+            return False
+        if domain_whitelist is not None and netloc not in domain_whitelist:
+            return False
+        if domain_blacklist is not None and netloc in domain_blacklist:
+            return False
+        if content_type:
+            ct = content_type.lower()
+            if ct == "article":
+                if not any(k in url.lower() for k in ("article", "blog", "news", "post")):
+                    return False
+            elif ct == "forum":
+                if not any(k in url.lower() for k in ("forum", "discussion", "thread", "topic")):
+                    return False
+            elif ct == "academic":
+                if not any(k in url.lower() for k in ("pdf", "doi", "scholar", "arxiv", "journal", "research")):
+                    return False
+        if language:
+            lang_pat = language.lower()
+            if not (f"/{lang_pat}/" in url.lower() or f"?lang={lang_pat}" in url.lower() or f"&lang={lang_pat}" in url.lower()):
+                return False
+        return True
+
+    filtered_urls = [r["url"] for r in search_results[:max_pages] if url_passes_filters(r["url"])]
+    if not filtered_urls:
+        logger.warning("All URLs filtered out by advanced criteria")
+        msg = "No suitable results after applying filters."
+        return (msg, []) if return_sources else msg
+
+    # Build sources list for the frontend (before content fetching)
+    _source_list = [
+        {"url": r.get("url", ""), "title": r.get("title", "")}
+        for r in search_results if r.get("url")
+    ]
+
+    # Fetch content in parallel
+    fetched_content = []
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_url = {
+            executor.submit(fetch_webpage_content, url, 8, retry_attempt=0): url
+            for url in filtered_urls
+        }
+        for future in as_completed(future_to_url):
+            url = future_to_url[future]
+            try:
+                result = future.result()
+                if result["success"] and result["content"] and len(result["content"]) >= min_content_length:
+                    fetched_content.append(result)
+            except Exception as e:
+                logger.error(f"Exception while fetching {url}: {str(e)}")
+
+    logger.info(f"Successfully fetched content from {len(fetched_content)} pages")
+
+    # Format results
+    output_parts = []
+
+    if search_results:
+        output_parts.append("```sources")
+        for i, result in enumerate(search_results, 1):
+            output_parts.append(f"[{i}] {result['title']}")
+            output_parts.append(f"    {result['url']}")
+            if result.get("age"):
+                output_parts.append(f"    {result['age']}")
+        output_parts.append("```")
+        output_parts.append("")
+
+    output_parts.append("=" * 70)
+    output_parts.append("WEB SEARCH RESULTS AND FETCHED CONTENT")
+    output_parts.append(f"Query: {query}")
+    output_parts.append(f"Searched {len(search_results)} results, fetched {len(fetched_content)} pages")
+    output_parts.append("=" * 70)
+    output_parts.append("")
+
+    output_parts.append("SEARCH RESULTS SUMMARY:")
+    output_parts.append("-" * 50)
+    for i, result in enumerate(search_results, 1):
+        output_parts.append(f"\n[{i}] {result['title']}")
+        output_parts.append(f"    URL: {result['url']}")
+        output_parts.append(f"    Snippet: {result['snippet'][:200]}...")
+        if result.get("age"):
+            output_parts.append(f"    Age: {result['age']}")
+
+    if fetched_content:
+        output_parts.append("\n" + "=" * 70)
+        output_parts.append("FETCHED PAGE CONTENT:")
+        output_parts.append("-" * 50)
+
+        for i, content in enumerate(fetched_content, 1):
+            output_parts.append(f"\n[CONTENT {i}] From: {content['url']}")
+            output_parts.append(f"Title: {content['title']}")
+            output_parts.append("-" * 30)
+
+            text = content["content"][:3000]
+            if len(content["content"]) > 3000:
+                text += "... [truncated]"
+            output_parts.append(text)
+
+            key_points = extract_key_points(content["content"])
+            if key_points:
+                output_parts.append("\nKey Points:")
+                for pt in key_points[:5]:
+                    output_parts.append(f"- {pt}")
+
+            tldr = get_tldr(content["content"])
+            if tldr:
+                output_parts.append("\nTL;DR:")
+                output_parts.append(tldr)
+
+            quotes = extract_quotes(content["content"])
+            if quotes:
+                output_parts.append("\nImportant Quotes:")
+                for q in quotes[:3]:
+                    output_parts.append(f"\u201c{q}\u201d")
+
+            stats = extract_statistics(content["content"])
+            if stats:
+                output_parts.append("\nData / Statistics:")
+                for s in stats[:5]:
+                    output_parts.append(f"- {s}")
+
+            output_parts.append("")
+
+    output_parts.append("=" * 70)
+    output_parts.append("END OF WEB SEARCH RESULTS")
+    output_parts.append("=" * 70)
+
+    instructions = (
+        "\n\nIMPORTANT INSTRUCTIONS:\n"
+        "1. Use the above web search results and fetched content to answer the user's question\n"
+        "2. Prioritize information from the FETCHED PAGE CONTENT section as it contains actual page data\n"
+        "3. Cross-reference multiple sources when possible\n"
+        "4. If the information is time-sensitive, pay attention to the age of the results\n"
+        "5. Be explicit if the search results don't contain sufficient information to fully answer the question"
+    )
+    output_parts.append(instructions)
+
+    result = "\n".join(output_parts)
+    return (result, _source_list) if return_sources else result
@@ -0,0 +1,527 @@
+"""Search provider implementations: SearXNG, Brave, DuckDuckGo, Google PSE, Tavily, Serper."""
+
+import json
+import logging
+import os
+from typing import List, Optional
+
+import httpx
+from bs4 import BeautifulSoup
+
+from src.constants import SEARXNG_INSTANCE
+from .analytics import RateLimitError, error_logger
+from .query import build_enhanced_query
+
+logger = logging.getLogger(__name__)
+
+# Timeout, in seconds, passed as ``timeout=`` to the Brave, DuckDuckGo,
+# Google PSE, Tavily and Serper HTTP requests in this module (the SearXNG
+# helpers pass their own literal timeouts).
+REQUEST_TIMEOUT = 20
+
+# Provider registry — maps setting value to (label, needs_key, needs_url)
+# "disabled" is a registry entry too, so it can be selected like any provider.
+PROVIDER_INFO = {
+    "searxng":  ("SearXNG",           False, True),
+    "brave":    ("Brave Search",      True,  False),
+    "duckduckgo": ("DuckDuckGo",      False, False),
+    "google_pse": ("Google PSE",      True,  False),
+    "tavily":   ("Tavily",            True,  False),
+    "serper":   ("Serper",            True,  False),
+    "disabled": ("Disabled",          False, False),
+}
+
+
+# ── Settings helpers ──
+
+def _get_search_settings() -> dict:
+    """Return search settings from admin config, falling back to env defaults."""
+    try:
+        from src.settings import load_settings
+        return load_settings()
+    except Exception:
+        return {}
+
+
+def _get_search_instance() -> str:
+    """Return the active search API URL from admin settings, falling back to env var."""
+    settings = _get_search_settings()
+    url = (settings.get("search_url") or "").strip()
+    if url:
+        return url.rstrip("/")
+    return SEARXNG_INSTANCE
+
+
+def _get_provider_key(provider: str) -> str:
+    """Return the API key for a specific provider, with legacy fallback."""
+    settings = _get_search_settings()
+    key_map = {
+        "brave": "brave_api_key",
+        "google_pse": "google_pse_key",
+        "tavily": "tavily_api_key",
+        "serper": "serper_api_key",
+    }
+    field = key_map.get(provider, "")
+    if field:
+        val = (settings.get(field) or "").strip()
+        if val:
+            return val
+    # Legacy fallback: old shared search_api_key field
+    return (settings.get("search_api_key") or "").strip()
+
+
+def _get_result_count() -> int:
+    """Return configured result count, default 5."""
+    settings = _get_search_settings()
+    try:
+        return int(settings.get("search_result_count", 5))
+    except (ValueError, TypeError):
+        return 5
+
+
+# ── SearXNG ──
+
+# Lowercase keywords that mark a query as a news lookup. searxng_search_api
+# tests them with a substring check against the lowercased query. The list
+# includes non-English terms such as "nyheter" and "idag".
+_NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "idag")
+
+# Default general engines (google/duckduckgo/brave/startpage/wikipedia) are
+# routinely rate-limited / CAPTCHA-blocked on this instance and return nothing.
+# Pin engines that actually respond so non-news queries get results without any
+# third-party API fallback. Override via SEARXNG_GENERAL_ENGINES.
+# The value is a comma-separated string sent as the "engines" request parameter;
+# an empty string disables the pinning.
+_GENERAL_ENGINES = os.environ.get("SEARXNG_GENERAL_ENGINES", "bing,mojeek,presearch")
+
+
+def searxng_search_api(query: str, count: int = 10, categories: str = "general",
+                       time_filter: Optional[str] = None) -> List[dict]:
+    """Search using SearXNG JSON API. Returns list of {title, url, snippet}."""
+    instance = _get_search_instance()
+    api_key = ""
+    headers = {"User-Agent": "Mozilla/5.0"}
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+    # News/fresh queries do badly in the 'general' category — it favours
+    # encyclopedic/tourism pages, ignores recency, and (with no language pin)
+    # bleeds in foreign-language results. When the agent layer detected
+    # freshness (time_filter) or the query reads like a news lookup, switch to
+    # the 'news' category, constrain recency, and pin language to English so a
+    # search like "Canada latest news" returns actual news instead of Wikipedia.
+    # Pin English for ALL searches — without it, SearXNG geolocates / mixes
+    # languages and brand-ambiguous terms bleed in foreign SEO pages (e.g.
+    # "Odyssey" → Honda Japan, "Trojan" → Japanese malware blogs, "Polyphemus"
+    # → Chinese math forums). The news path already did this; general didn't.
+    params = {"q": query, "format": "json", "language": "en"}
+    q_lc = query.lower()
+    is_news = time_filter is not None or any(h in q_lc for h in _NEWS_HINTS)
+    if is_news and categories == "general":
+        params["categories"] = "news"
+        if time_filter in ("day", "week", "month", "year"):
+            # 'day' is too sparse on most SearXNG news engines — widen to a week
+            # so there's enough volume; the news category already biases recent.
+            params["time_range"] = "week" if time_filter in ("day", "week") else time_filter
+    else:
+        params["categories"] = categories
+        # Route general queries to engines that aren't blocked (default general
+        # set returns 0 on this instance — see _GENERAL_ENGINES).
+        if categories == "general" and _GENERAL_ENGINES:
+            params["engines"] = _GENERAL_ENGINES
+    try:
+        def _parse_results(results):
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("url", ""),
+                    "snippet": r.get("content", ""),
+                }
+                for r in results[:count]
+                if r.get("url")
+            ]
+
+        def _run(search_params):
+            response = httpx.get(
+                f"{instance}/search",
+                params=search_params,
+                headers=headers or None,
+                timeout=15,
+            )
+            response.raise_for_status()
+            data = response.json()
+            return _parse_results(data.get("results", [])), data
+
+        active_params = params
+        parsed, data = _run(active_params)
+        if not parsed and is_news and categories == "general":
+            # Some self-hosted SearXNG configs have no working news engines.
+            # Fall back to the known-good general engines before reporting an
+            # empty search, otherwise common queries like "Canada news" fail.
+            fallback = {
+                "q": query,
+                "format": "json",
+                "language": "en",
+                "categories": "general",
+            }
+            if _GENERAL_ENGINES:
+                fallback["engines"] = _GENERAL_ENGINES
+            logger.info(
+                "SearXNG news search returned 0 results for %r; retrying general engines",
+                query,
+            )
+            active_params = fallback
+            parsed, data = _run(active_params)
+        if not parsed and active_params.get("language"):
+            fallback = dict(active_params)
+            fallback.pop("language", None)
+            logger.info(
+                "SearXNG language-pinned search returned 0 results for %r; retrying without language",
+                query,
+            )
+            active_params = fallback
+            parsed, data = _run(active_params)
+        if not parsed and active_params.get("engines"):
+            fallback = dict(active_params)
+            fallback.pop("engines", None)
+            logger.info(
+                "SearXNG pinned engines returned 0 results for %r; retrying default engines",
+                query,
+            )
+            parsed, data = _run(fallback)
+        logger.info(f"SearXNG JSON API returned {len(parsed)} results for: {query}")
+        if not parsed:
+            unresponsive = data.get("unresponsive_engines") if isinstance(data, dict) else None
+            if unresponsive:
+                logger.info(f"SearXNG unresponsive engines for {query!r}: {unresponsive}")
+        return parsed
+    except Exception as e:
+        logger.warning(f"SearXNG JSON API search failed: {e}")
+        html_results = searxng_search(query, max_results=count)
+        if html_results:
+            logger.info(f"SearXNG HTML fallback returned {len(html_results)} results for: {query}")
+        return html_results
+
+
+def searxng_search(query, max_results=10):
+    """Search using SearXNG instance - parsing HTML."""
+    instance = _get_search_instance()
+    api_key = ""
+    req_headers = {"User-Agent": "Mozilla/5.0"}
+    if api_key:
+        req_headers["Authorization"] = f"Bearer {api_key}"
+    try:
+        response = httpx.get(
+            f"{instance}/search",
+            params={"q": query},
+            headers=req_headers,
+            timeout=10,
+        )
+        if response.is_success:
+            soup = BeautifulSoup(response.text, "html.parser")
+            results = []
+            for article in soup.select("article.result")[:max_results]:
+                title_elem = article.select_one("h3 a")
+                if not title_elem:
+                    continue
+                title = title_elem.get_text(strip=True)
+                url = title_elem.get("href", "")
+                snippet_elem = article.select_one("p.content")
+                snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
+                results.append({"title": title, "url": url, "snippet": snippet})
+            logger.info(f"SearXNG search (HTML) returned {len(results)} results")
+            return results
+    except Exception as e:
+        logger.error(f"SearXNG search failed: {e}")
+    return []
+
+
+# ── Brave ──
+
+def brave_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+    """Search using Brave API with key from admin settings or env var."""
+    api_key = _get_provider_key("brave") or os.environ.get("DATA_BRAVE_API_KEY") or ""
+    return _brave_search_impl(query, count, time_filter, search_config={"brave_api_key": api_key})
+
+
+def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None, search_config: dict = None) -> List[dict]:
+    """Core Brave API call. Returns a list of result dicts or an empty list on failure."""
+    enhanced_query = build_enhanced_query(query, time_filter)
+    config = search_config or {}
+
+    brave_api_key = config.get("brave_api_key")
+    if not brave_api_key:
+        brave_api_key = os.environ.get("DATA_BRAVE_API_KEY")
+
+    if not brave_api_key:
+        logger.warning("Brave API key not found, returning empty results for fallback")
+        return []
+
+    headers = {"X-Subscription-Token": brave_api_key, "Accept": "application/json"}
+    params = {"q": enhanced_query, "count": count}
+    if time_filter:
+        time_map = {"day": "day", "week": "week", "month": "month", "year": "year"}
+        if time_filter in time_map:
+            params["freshness"] = time_map[time_filter]
+
+    logger.info(f"Executing Brave search with query: {enhanced_query}")
+    try:
+        response = httpx.get(
+            "https://api.search.brave.com/res/v1/web/search",
+            headers=headers,
+            params=params,
+            timeout=REQUEST_TIMEOUT,
+        )
+        if response.status_code == 429:
+            raise RateLimitError("Brave rate limit hit")
+        response.raise_for_status()
+    except httpx.RequestError as e:
+        error_logger.error(f"NetworkError during Brave search: {e}")
+        return []
+    except RateLimitError as e:
+        error_logger.error(str(e))
+        return []
+
+    try:
+        data = response.json()
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse Brave API response: {e}")
+        return []
+
+    results = []
+    if "web" in data and "results" in data["web"]:
+        for item in data["web"]["results"][:count]:
+            url = item.get("url", "")
+            if not url:
+                continue
+            results.append({
+                "title": item.get("title", ""),
+                "url": url,
+                "snippet": item.get("description", "") or item.get("content", ""),
+                "age": item.get("date", "") if item.get("date") else "",
+            })
+
+    logger.info(f"Brave search returned {len(results)} results")
+    return results
+
+
+# ── DuckDuckGo (free, no key) ──
+
+def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+    """Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
+    def _html_fallback() -> List[dict]:
+        try:
+            response = httpx.get(
+                "https://html.duckduckgo.com/html/",
+                params={"q": query},
+                headers={"User-Agent": "Mozilla/5.0"},
+                timeout=REQUEST_TIMEOUT,
+            )
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, "html.parser")
+            parsed = []
+            for result in soup.select(".result")[:count]:
+                link = result.select_one(".result__a")
+                if not link:
+                    continue
+                url = link.get("href", "")
+                if not url:
+                    continue
+                snippet_el = result.select_one(".result__snippet")
+                parsed.append({
+                    "title": link.get_text(" ", strip=True),
+                    "url": url,
+                    "snippet": snippet_el.get_text(" ", strip=True) if snippet_el else "",
+                })
+            logger.info(f"DuckDuckGo HTML search returned {len(parsed)} results")
+            return parsed
+        except Exception as e:
+            logger.warning(f"DuckDuckGo HTML search failed: {e}")
+            return []
+
+    try:
+        from duckduckgo_search import DDGS
+    except ImportError:
+        logger.warning("duckduckgo-search package not installed; using HTML fallback")
+        return _html_fallback()
+
+    timelimit = None
+    if time_filter:
+        time_map = {"day": "d", "week": "w", "month": "m", "year": "y"}
+        timelimit = time_map.get(time_filter)
+
+    try:
+        ddgs = DDGS()
+        raw = ddgs.text(query, max_results=count, timelimit=timelimit)
+        results = []
+        for item in raw:
+            url = item.get("href", "")
+            if not url:
+                continue
+            results.append({
+                "title": item.get("title", ""),
+                "url": url,
+                "snippet": item.get("body", ""),
+            })
+        logger.info(f"DuckDuckGo search returned {len(results)} results")
+        return results or _html_fallback()
+    except Exception as e:
+        logger.warning(f"DuckDuckGo search failed: {e}")
+        return _html_fallback()
+
+
+# ── Google Programmable Search Engine ──
+
+def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+    """Search using Google PSE (Custom Search JSON API).
+
+    Requires two keys in settings:
+      - search_api_key: Google API key
+      - google_pse_cx: Programmable Search Engine ID (cx)
+    Or env vars GOOGLE_API_KEY and GOOGLE_PSE_CX.
+    """
+    settings = _get_search_settings()
+    api_key = _get_provider_key("google_pse") or os.environ.get("GOOGLE_API_KEY", "")
+    cx = (settings.get("google_pse_cx") or "").strip() or os.environ.get("GOOGLE_PSE_CX", "")
+
+    if not api_key or not cx:
+        logger.warning("Google PSE: missing API key or CX ID")
+        return []
+
+    params = {
+        "key": api_key,
+        "cx": cx,
+        "q": query,
+        "num": min(count, 10),  # Google PSE max is 10 per request
+    }
+    if time_filter:
+        # dateRestrict: d[number], w[number], m[number], y[number]
+        time_map = {"day": "d1", "week": "w1", "month": "m1", "year": "y1"}
+        if time_filter in time_map:
+            params["dateRestrict"] = time_map[time_filter]
+
+    try:
+        response = httpx.get(
+            "https://www.googleapis.com/customsearch/v1",
+            params=params,
+            timeout=REQUEST_TIMEOUT,
+        )
+        if response.status_code == 429:
+            raise RateLimitError("Google PSE rate limit hit")
+        response.raise_for_status()
+        data = response.json()
+    except httpx.RequestError as e:
+        error_logger.error(f"Google PSE search failed: {e}")
+        return []
+    except RateLimitError as e:
+        error_logger.error(str(e))
+        return []
+
+    results = []
+    for item in data.get("items", [])[:count]:
+        url = item.get("link", "")
+        if not url:
+            continue
+        results.append({
+            "title": item.get("title", ""),
+            "url": url,
+            "snippet": item.get("snippet", ""),
+        })
+
+    logger.info(f"Google PSE returned {len(results)} results")
+    return results
+
+
+# ── Tavily ──
+
+def tavily_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+    """Search using Tavily API. Requires search_api_key or TAVILY_API_KEY env var."""
+    api_key = _get_provider_key("tavily") or os.environ.get("TAVILY_API_KEY", "")
+    if not api_key:
+        logger.warning("Tavily: no API key configured")
+        return []
+
+    payload = {
+        "query": query,
+        "max_results": count,
+        "include_answer": False,
+    }
+    if time_filter:
+        time_map = {"day": "day", "week": "week", "month": "month", "year": "year"}
+        if time_filter in time_map:
+            payload["days"] = {"day": 1, "week": 7, "month": 30, "year": 365}[time_filter]
+
+    try:
+        response = httpx.post(
+            "https://api.tavily.com/search",
+            json=payload,
+            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
+            timeout=REQUEST_TIMEOUT,
+        )
+        if response.status_code == 429:
+            raise RateLimitError("Tavily rate limit hit")
+        response.raise_for_status()
+        data = response.json()
+    except httpx.RequestError as e:
+        error_logger.error(f"Tavily search failed: {e}")
+        return []
+    except RateLimitError as e:
+        error_logger.error(str(e))
+        return []
+
+    results = []
+    for item in data.get("results", [])[:count]:
+        url = item.get("url", "")
+        if not url:
+            continue
+        results.append({
+            "title": item.get("title", ""),
+            "url": url,
+            "snippet": item.get("content", ""),
+            "age": item.get("published_date", ""),
+        })
+
+    logger.info(f"Tavily returned {len(results)} results")
+    return results
+
+
+# ── Serper.dev ──
+
+def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+    """Search using Serper.dev API. Requires search_api_key or SERPER_API_KEY env var."""
+    api_key = _get_provider_key("serper") or os.environ.get("SERPER_API_KEY", "")
+    if not api_key:
+        logger.warning("Serper: no API key configured")
+        return []
+
+    payload = {
+        "q": query,
+        "num": count,
+    }
+    if time_filter:
+        time_map = {"day": "qdr:d", "week": "qdr:w", "month": "qdr:m", "year": "qdr:y"}
+        if time_filter in time_map:
+            payload["tbs"] = time_map[time_filter]
+
+    try:
+        response = httpx.post(
+            "https://google.serper.dev/search",
+            json=payload,
+            headers={"X-API-KEY": api_key, "Content-Type": "application/json"},
+            timeout=REQUEST_TIMEOUT,
+        )
+        if response.status_code == 429:
+            raise RateLimitError("Serper rate limit hit")
+        response.raise_for_status()
+        data = response.json()
+    except httpx.RequestError as e:
+        error_logger.error(f"Serper search failed: {e}")
+        return []
+    except RateLimitError as e:
+        error_logger.error(str(e))
+        return []
+
+    results = []
+    for item in data.get("organic", [])[:count]:
+        url = item.get("link", "")
+        if not url:
+            continue
+        results.append({
+            "title": item.get("title", ""),
+            "url": url,
+            "snippet": item.get("snippet", ""),
+            "age": item.get("date", ""),
+        })
+
+    logger.info(f"Serper returned {len(results)} results")
+    return results
@@ -0,0 +1,128 @@
+"""Query enhancement, entity extraction, and cache duration helpers."""
+
+import re
+import logging
+from datetime import timedelta
+from typing import Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+# ----------------------------------------------------------------------
+# Query processing helpers
+# ----------------------------------------------------------------------
+def _detect_question_type(query: str) -> Optional[str]:
+    """Return the leading question word if present (who, what, when, where, why, how)."""
+    q = query.strip().lower()
+    for word in ("who", "what", "when", "where", "why", "how"):
+        if q.startswith(word):
+            return word
+    return None
+
+
+def _extract_entities(query: str) -> Dict[str, List[str]]:
+    """Lightweight entity extraction: capitalized words and date patterns."""
+    entities: Dict[str, List[str]] = {"names": [], "dates": []}
+    qtype = _detect_question_type(query)
+    cleaned = query
+    if qtype:
+        cleaned = re.sub(rf"^{qtype}\b", "", cleaned, flags=re.I).strip()
+    for token in re.findall(r"\b[A-Z][a-zA-Z]+\b", cleaned):
+        entities["names"].append(token)
+    for year in re.findall(r"\b(19|20)\d{2}\b", cleaned):
+        entities["dates"].append(year)
+    month_day_year = re.findall(
+        r"\b(?:Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December)\s+\d{1,2},?\s*\d{4}\b",
+        cleaned,
+        flags=re.I,
+    )
+    entities["dates"].extend(month_day_year)
+    return entities
+
+
+def _split_multi_part(query: str) -> List[str]:
+    """Split a query into sub-queries on common conjunctions."""
+    parts = re.split(r"\s+and\s+|\s+or\s+|;", query, flags=re.I)
+    return [p.strip() for p in parts if p.strip()]
+
+
+def _extract_site_filter(query: str) -> Tuple[str, Optional[str]]:
+    """Detect a 'site:example.com' token. Returns (query_without_token, site_or_None)."""
+    match = re.search(r"\bsite:([^\s]+)", query, flags=re.I)
+    if match:
+        site = match.group(1)
+        new_query = re.sub(r"\bsite:[^\s]+", "", query, flags=re.I).strip()
+        return new_query, site
+    return query, None
+
+
+def _boost_entities_in_query(base_query: str, entities: Dict[str, List[str]]) -> str:
+    """Append extracted entities to the query using OR to increase relevance."""
+    parts = [base_query]
+    if entities.get("names"):
+        parts.append(" OR ".join(f'"{n}"' for n in entities["names"]))
+    if entities.get("dates"):
+        parts.append(" OR ".join(f'"{d}"' for d in entities["dates"]))
+    return " ".join(parts)
+
+
+def enhance_query(original_query: str) -> Tuple[str, Optional[str]]:
+    """Process the original query: site filter, question type boosts, entity extraction."""
+    query_without_site, site = _extract_site_filter(original_query)
+    sub_queries = _split_multi_part(query_without_site)
+
+    enhanced_subs: List[str] = []
+    for sub in sub_queries:
+        qtype = _detect_question_type(sub)
+        boost_keywords = []
+        if qtype == "who":
+            boost_keywords.append("person")
+        elif qtype == "when":
+            boost_keywords.append("date")
+        elif qtype == "where":
+            boost_keywords.append("location")
+        elif qtype == "why":
+            boost_keywords.append("reason")
+        elif qtype == "how":
+            boost_keywords.append("method")
+        entities = _extract_entities(sub)
+        boosted = _boost_entities_in_query(sub, entities)
+        if boost_keywords:
+            boosted = f'({boosted}) OR ({" OR ".join(boost_keywords)})'
+        enhanced_subs.append(boosted)
+
+    final_query = " AND ".join(f"({s})" for s in enhanced_subs)
+    if site:
+        final_query = f"{final_query} site:{site}"
+    return final_query, site
+
+
+def build_enhanced_query(query: str, time_filter: str = None) -> str:
+    """Build an enhanced search query with optional time filtering."""
+    enhanced_query, _ = enhance_query(query)
+
+    if time_filter:
+        time_map = {"day": "d", "week": "w", "month": "m", "year": "y"}
+        if time_filter in time_map:
+            enhanced_query = f"{enhanced_query} after:{time_map[time_filter]}"
+            logger.info(f"Added time filter '{time_filter}' to query")
+
+    logger.info(f"Enhanced query: '{query}' -> '{enhanced_query}'")
+    return enhanced_query
+
+
+# ----------------------------------------------------------------------
+# Cache duration helpers
+# ----------------------------------------------------------------------
+def _is_news_query(query: str) -> bool:
+    """Lightweight heuristic to decide if a query is news-oriented."""
+    news_terms = {"news", "latest", "breaking", "today", "today's", "current", "updates", "happening"}
+    tokens = set(re.findall(r"\b\w+\b", query.lower()))
+    return bool(tokens & news_terms)
+
+
+def _cache_duration_for_query(query: str) -> timedelta:
+    """News queries -> 30 minutes, reference queries -> 24 hours."""
+    if _is_news_query(query):
+        return timedelta(minutes=30)
+    return timedelta(hours=24)
@@ -0,0 +1,127 @@
+"""Search result ranking based on relevance, source quality, and recency."""
+
+import re
+import logging
+from datetime import datetime
+from typing import List, Optional
+from urllib.parse import urlparse
+
+logger = logging.getLogger(__name__)
+
+# Words that mark a query as news-oriented. ``rank_search_results`` compares
+# them against the lower-cased word tokens of the query.
+_NEWS_HINTS = {"news", "nyheter", "headlines", "breaking", "latest", "today", "idag"}
+# Sports vocabulary. Entries may span several words ("world cup") because
+# ``rank_search_results`` tests them against whole strings (the query, the
+# result text, the host), not against single tokens.
+_SPORTS_HINTS = {
+    "sport", "sports", "soccer", "football", "hockey", "nba", "nfl", "mlb",
+    "fifa", "world cup", "championship", "quarterfinal", "eliminates",
+}
+# Hosts that ``rank_search_results`` scores down for news queries. Lookup is by
+# exact host string, which is why bare and ``www.`` variants are both listed.
+_LOW_VALUE_NEWS_DOMAINS = {
+    "facebook.com", "www.facebook.com", "sports.yahoo.com", "yahoo.com",
+    "www.yahoo.com", "msn.com", "www.msn.com",
+}
+# Hosts that ``rank_search_results`` treats as authoritative (exact host match,
+# hence the paired bare/``www.`` entries).
+_TRUSTED_NEWS_DOMAINS = {
+    "apnews.com", "www.apnews.com", "reuters.com", "www.reuters.com",
+    "bbc.com", "www.bbc.com", "cbc.ca", "www.cbc.ca",
+    "ctvnews.ca", "www.ctvnews.ca", "globalnews.ca", "www.globalnews.ca",
+    "theguardian.com",
+    "www.theguardian.com", "euronews.com", "www.euronews.com",
+    "dw.com", "www.dw.com", "government.se", "www.government.se",
+}
+
+
+def _domain(url: str) -> str:
+    """Return the lower-cased host of *url*, or ``""`` when there is none.
+
+    ``hostname`` is used instead of ``netloc`` so that credentials and port
+    numbers (``user:pw@host:443``) are not part of the result; otherwise such
+    URLs would miss exact-match lookups in the domain sets and suffix checks
+    like ``.edu``.
+
+    Args:
+        url: Absolute URL. Non-string values are treated as having no host.
+
+    Returns:
+        The host in lower case, or an empty string if the URL has no host or
+        cannot be parsed.
+    """
+    if not isinstance(url, str):
+        return ""
+    try:
+        return urlparse(url).hostname or ""
+    except ValueError:
+        # urlparse rejects malformed hosts, e.g. unbalanced IPv6 brackets.
+        return ""
+
+
+def rank_search_results(query: str, results: List[dict]) -> List[dict]:
+    """Rank search results by title relevance, snippet quality, domain authority, and recency.
+
+    Every result gets the score
+
+        2.0 * title + 1.0 * snippet + 1.5 * domain + 1.0 * recency + news adjustment
+
+    and the results are returned best-first. Results with equal scores keep
+    their input order.
+
+    Args:
+        query: The search query the results were fetched for.
+        results: Result dicts. The keys ``title``, ``snippet``, ``url`` and
+            ``age`` are read; a missing key or a ``None`` value counts as
+            empty.
+
+    Returns:
+        A new list holding the same dict objects, highest score first.
+    """
+    query_terms = [t.lower() for t in re.findall(r"\b\w+\b", query)]
+    # Compiled once per call instead of once per (term, title) pair.
+    term_patterns = [re.compile(rf"\b{re.escape(term)}\b") for term in query_terms]
+    # Sports hints are matched as whole words/phrases in free text: a plain
+    # substring test misfires on ordinary words ("inflation" contains "nfl",
+    # "transport" and "passport" contain "sport").
+    sports_patterns = [re.compile(rf"\b{re.escape(hint)}\b") for hint in _SPORTS_HINTS]
+
+    def mentions_sports(text: str) -> bool:
+        """True when *text* contains a sports hint as a whole word or phrase."""
+        return any(pattern.search(text) for pattern in sports_patterns)
+
+    is_news_query = any(term in _NEWS_HINTS for term in query_terms)
+    is_sports_query = mentions_sports(query.lower())
+
+    def title_score(title: str) -> float:
+        """Fraction of query terms that occur as whole words in the title."""
+        if not title or not term_patterns:
+            return 0.0
+        title_lc = title.lower()
+        matches = sum(1 for pattern in term_patterns if pattern.search(title_lc))
+        return matches / len(term_patterns)
+
+    def snippet_score(snippet: str) -> float:
+        """Mean of a length factor (saturating at 200 chars) and term coverage."""
+        if not snippet:
+            return 0.0
+        snippet_lc = snippet.lower()
+        length_factor = min(len(snippet), 200) / 200
+        term_hits = sum(1 for term in query_terms if term in snippet_lc)
+        term_factor = term_hits / len(query_terms) if query_terms else 0.0
+        return (length_factor + term_factor) / 2
+
+    def domain_score(netloc: str) -> float:
+        """1.0 for trusted/.edu/.gov hosts, 0.7 for .org, 0.4 otherwise, 0.0 if unknown."""
+        if not netloc:
+            return 0.0
+        if netloc in _TRUSTED_NEWS_DOMAINS or netloc.endswith((".edu", ".gov")):
+            return 1.0
+        if netloc.endswith(".org"):
+            return 0.7
+        return 0.4
+
+    def recency_score(age_str: Optional[str]) -> float:
+        """1.0 up to 7 days old, falling linearly to 0.0 at 30 days; 0.0 if unparseable."""
+        if not age_str:
+            return 0.0
+        dt = None
+        for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
+            try:
+                dt = datetime.strptime(age_str, fmt)
+                break
+            except (ValueError, TypeError):
+                # Wrong format (ValueError) or a non-string value (TypeError).
+                continue
+        if dt is None:
+            return 0.0
+        days_old = (datetime.now() - dt).days
+        if days_old <= 7:
+            return 1.0
+        if days_old >= 30:
+            return 0.0
+        return (30 - days_old) / 23
+
+    def news_quality_adjustment(title: str, snippet: str, netloc: str) -> float:
+        """Bonus/penalty applied only when the query itself is news-oriented."""
+        if not is_news_query:
+            return 0.0
+        text = f"{title} {snippet}".lower()
+        adjustment = 0.0
+        if netloc in _TRUSTED_NEWS_DOMAINS:
+            adjustment += 1.2
+        if any(term in text for term in ("latest news", "breaking news", "daily coverage", "news from")):
+            adjustment += 0.4
+        if netloc in _LOW_VALUE_NEWS_DOMAINS:
+            adjustment -= 0.8
+        # Hosts run words together, so they keep the substring test; free text
+        # uses whole-word matching.
+        if not is_sports_query and (
+            mentions_sports(text) or any(hint in netloc for hint in _SPORTS_HINTS)
+        ):
+            adjustment -= 1.5
+        # A country/news query should not rank a page whose title/snippet barely
+        # mentions the country above actual news pages for that country.
+        subject_terms = [t for t in query_terms if t not in _NEWS_HINTS]
+        if subject_terms and not any(t in text or t in netloc for t in subject_terms):
+            adjustment -= 1.0
+        return adjustment
+
+    scored = []
+    for result in results:
+        # ``or ""`` also covers keys that are present but set to None, which
+        # would otherwise leak the text "None" into matching or break the
+        # host checks.
+        title = result.get("title") or ""
+        snippet = result.get("snippet") or ""
+        url = result.get("url") or ""
+        age = result.get("age")
+        netloc = _domain(url)
+
+        score = (
+            2.0 * title_score(title)
+            + 1.0 * snippet_score(snippet)
+            + 1.5 * domain_score(netloc)
+            + 1.0 * recency_score(age)
+            + news_quality_adjustment(title, snippet, netloc)
+        )
+        scored.append((score, result))
+
+    # list.sort is stable, so equal scores keep their input order.
+    scored.sort(key=lambda pair: pair[0], reverse=True)
+    return [result for _, result in scored]
@@ -0,0 +1,95 @@
+# services/search/service.py
+"""Search service — clean interface for web search."""
+
+from dataclasses import dataclass
+from typing import List, Optional, Dict, Any
+
+from . import (
+    comprehensive_web_search,
+    fetch_webpage_content,
+    get_search_config,
+)
+
+
+@dataclass
+class SearchResult:
+    """One hit returned by a web search.
+
+    Attributes:
+        url: Address of the result page.
+        title: Title of the result.
+        snippet: Short text excerpt for the result.
+        content: Page content when it is available, otherwise ``None``.
+    """
+    url: str
+    title: str
+    snippet: str
+    content: Optional[str] = None
+
+
+@dataclass
+class SearchResponse:
+    """Outcome of one search call.
+
+    Attributes:
+        query: The query string that was searched for.
+        results: The hits, in the order they were produced.
+        total: Number of hits.
+        cached: Flag for cache-served responses; ``False`` unless set by the
+            creator of the response.
+    """
+    query: str
+    results: List[SearchResult]
+    total: int
+    cached: bool = False
+
+
+class SearchService:
+    """
+    Web search service.
+
+    Usage:
+        service = SearchService()
+        result = await service.search("python async patterns")
+        for r in result.results:
+            print(f"{r.title}: {r.url}")
+
+    Attributes:
+        default_depth: Depth used by ``search`` when the caller passes none.
+        default_fetch_content: Whether ``search`` requests page content when
+            the caller does not say. It is deliberately not stored as
+            ``fetch_content``: an instance attribute of that name would shadow
+            the ``fetch_content`` method and make it uncallable on instances.
+    """
+
+    def __init__(self, default_depth: int = 1, fetch_content: bool = True):
+        """Store the defaults applied by ``search``.
+
+        Args:
+            default_depth: Default search depth.
+            fetch_content: Default for fetching full page content.
+        """
+        self.default_depth = default_depth
+        self.default_fetch_content = fetch_content
+
+    async def search(
+        self,
+        query: str,
+        depth: Optional[int] = None,
+        fetch_content: Optional[bool] = None,
+    ) -> SearchResponse:
+        """
+        Search the web.
+
+        Args:
+            query: Search query
+            depth: Search depth (1=quick, 2=thorough, 3=comprehensive);
+                ``None`` or 0 falls back to ``default_depth``
+            fetch_content: Whether to fetch full page content; ``None`` falls
+                back to ``default_fetch_content``
+
+        Returns:
+            SearchResponse with results
+        """
+        depth = depth or self.default_depth
+        if fetch_content is None:
+            fetch_content = self.default_fetch_content
+
+        # Use existing search implementation; depth scales the result budget.
+        # NOTE(review): assumes this returns an iterable of dicts - confirm.
+        raw_results = await comprehensive_web_search(
+            query,
+            max_results=10 * depth,
+            fetch_content=fetch_content,
+        )
+
+        results = [
+            SearchResult(
+                url=r.get("url", ""),
+                title=r.get("title", ""),
+                snippet=r.get("snippet", ""),
+                content=r.get("content"),
+            )
+            for r in raw_results
+        ]
+
+        return SearchResponse(
+            query=query,
+            results=results,
+            total=len(results),
+        )
+
+    async def fetch_content(self, url: str) -> Optional[str]:
+        """Fetch content from a URL."""
+        return await fetch_webpage_content(url)
+
+    def get_config(self) -> Dict[str, Any]:
+        """Get current search configuration."""
+        return get_search_config()