fix: resolve DuckDuckGo redirect URLs in HTML fallback search

The DuckDuckGo HTML fallback returns redirect URLs (//duckduckgo.com/l/?uddg=...) instead of actual page URLs. This caused fetch_webpage_content() to reject them instantly because _public_http_url() requires an http/https scheme, making search results unfetchable in deep research mode.

Added _resolve_url() to:
- Convert protocol-relative URLs to absolute (https:)
- Convert path-relative URLs to absolute (against https://html.duckduckgo.com)
- Extract the real URL from the uddg parameter of DuckDuckGo's /l/?uddg= redirect links
2026-06-16 09:45:24 -04:00 · 2026-06-01 19:42:01 -03:00
parent 7b9ef95b60
commit c0466274ed
2 changed files with 44 additions and 2 deletions
@@ -4,6 +4,7 @@ import json
 import logging
 import os
 from typing import List, Optional
+from urllib.parse import urljoin, urlparse, parse_qs

 import httpx
 from bs4 import BeautifulSoup
@@ -299,6 +300,25 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None

 def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
+    def _resolve_url(raw: str) -> str:
+        """Resolve DuckDuckGo redirect URL to the actual destination URL."""
+        if not raw:
+            return raw
+        resolved = raw
+        if resolved.startswith("//"):
+            resolved = "https:" + resolved
+        elif resolved.startswith("/"):
+            resolved = urljoin("https://html.duckduckgo.com", resolved)
+        try:
+            parsed = urlparse(resolved)
+            if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
+                qs = parse_qs(parsed.query)
+                if "uddg" in qs:
+                    return qs["uddg"][0]
+        except Exception:
+            pass
+        return resolved
+
    def _html_fallback() -> List[dict]:
        try:
            response = httpx.get(
@@ -314,7 +334,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
                link = result.select_one(".result__a")
                if not link:
                    continue
-                url = link.get("href", "")
+                url = _resolve_url(link.get("href", ""))
                if not url:
                    continue
                snippet_el = result.select_one(".result__snippet")
@@ -4,6 +4,7 @@ import json
 import logging
 import os
 from typing import List, Optional
+from urllib.parse import urljoin, urlparse, parse_qs

 import httpx
 from bs4 import BeautifulSoup
@@ -300,6 +301,27 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None

 def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
    """Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
+    def _resolve_url(raw: str) -> str:
+        """Resolve DuckDuckGo redirect URL to the actual destination URL."""
+        if not raw:
+            return raw
+        # Handle protocol-relative URLs
+        resolved = raw
+        if resolved.startswith("//"):
+            resolved = "https:" + resolved
+        elif resolved.startswith("/"):
+            resolved = urljoin("https://html.duckduckgo.com", resolved)
+        # Extract the actual URL from DuckDuckGo's /l/?uddg= redirect
+        try:
+            parsed = urlparse(resolved)
+            if "duckduckgo.com" in (parsed.hostname or "") and parsed.path.rstrip("/") == "/l":
+                qs = parse_qs(parsed.query)
+                if "uddg" in qs:
+                    return qs["uddg"][0]
+        except Exception:
+            pass
+        return resolved
+
    def _html_fallback() -> List[dict]:
        try:
            response = httpx.get(
@@ -315,7 +337,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
                link = result.select_one(".result__a")
                if not link:
                    continue
-                url = link.get("href", "")
+                url = _resolve_url(link.get("href", ""))
                if not url:
                    continue
                snippet_el = result.select_one(".result__snippet")