fix: resolve DuckDuckGo redirect URLs in HTML fallback search

The DuckDuckGo HTML fallback returns redirect URLs (//duckduckgo.com/l/?uddg=...)
instead of actual page URLs. This caused fetch_webpage_content() to reject them
instantly because _public_http_url() requires an http/https scheme, making search
results unfetchable in deep research mode.
Added _resolve_url() to:
- Convert protocol-relative URLs to absolute (https:)
- Convert path-relative URLs to absolute
- Extract the real URL from DuckDuckGo's /l/?uddg= redirect parameters
This commit is contained in:
BSG-Walter
2026-06-01 19:42:01 -03:00
parent 7b9ef95b60
commit c0466274ed
2 changed files with 44 additions and 2 deletions
+21 -1
View File
@@ -4,6 +4,7 @@ import json
import logging
import os
from typing import List, Optional
from urllib.parse import urljoin, urlparse, parse_qs
import httpx
from bs4 import BeautifulSoup
@@ -299,6 +300,25 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
def _resolve_url(raw: str) -> str:
    """Resolve a DuckDuckGo result href to the actual destination URL.

    Protocol-relative hrefs ("//host/...") are given an "https:" scheme and
    path-relative hrefs ("/...") are joined onto https://html.duckduckgo.com.
    If the resulting URL is a DuckDuckGo "/l/?uddg=<target>" redirect, the
    decoded ``uddg`` target is returned instead.

    Args:
        raw: The href value taken from a search result link.

    Returns:
        The redirect target when one can be extracted, otherwise the
        absolute form of ``raw``. Falsy input is returned unchanged.
    """
    if not raw:
        return raw
    resolved = raw
    if resolved.startswith("//"):
        resolved = "https:" + resolved
    elif resolved.startswith("/"):
        resolved = urljoin("https://html.duckduckgo.com", resolved)
    try:
        parsed = urlparse(resolved)
        host = parsed.hostname or ""
        # Match the domain exactly (or a real subdomain). A substring test
        # would also accept look-alike hosts such as "notduckduckgo.com" or
        # "duckduckgo.com.evil.test" and unwrap their "uddg" parameter.
        is_ddg_host = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
        if is_ddg_host and parsed.path.rstrip("/") == "/l":
            targets = parse_qs(parsed.query).get("uddg")
            if targets:
                return targets[0]
    except ValueError:
        # urllib.parse raises ValueError for malformed URLs (e.g. an
        # unbalanced IPv6 bracket); best effort: keep the absolute form.
        pass
    return resolved
def _html_fallback() -> List[dict]:
try:
response = httpx.get(
@@ -314,7 +334,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
link = result.select_one(".result__a")
if not link:
continue
url = link.get("href", "")
url = _resolve_url(link.get("href", ""))
if not url:
continue
snippet_el = result.select_one(".result__snippet")
+23 -1
View File
@@ -4,6 +4,7 @@ import json
import logging
import os
from typing import List, Optional
from urllib.parse import urljoin, urlparse, parse_qs
import httpx
from bs4 import BeautifulSoup
@@ -300,6 +301,27 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
def _resolve_url(raw: str) -> str:
    """Resolve a DuckDuckGo result href to the actual destination URL.

    Protocol-relative hrefs ("//host/...") are given an "https:" scheme and
    path-relative hrefs ("/...") are joined onto https://html.duckduckgo.com.
    If the resulting URL is a DuckDuckGo "/l/?uddg=<target>" redirect, the
    decoded ``uddg`` target is returned instead.

    Args:
        raw: The href value taken from a search result link.

    Returns:
        The redirect target when one can be extracted, otherwise the
        absolute form of ``raw``. Falsy input is returned unchanged.
    """
    if not raw:
        return raw
    # Handle protocol-relative and path-relative URLs
    resolved = raw
    if resolved.startswith("//"):
        resolved = "https:" + resolved
    elif resolved.startswith("/"):
        resolved = urljoin("https://html.duckduckgo.com", resolved)
    # Extract the actual URL from DuckDuckGo's /l/?uddg= redirect
    try:
        parsed = urlparse(resolved)
        host = parsed.hostname or ""
        # Match the domain exactly (or a real subdomain). A substring test
        # would also accept look-alike hosts such as "notduckduckgo.com" or
        # "duckduckgo.com.evil.test" and unwrap their "uddg" parameter.
        is_ddg_host = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
        if is_ddg_host and parsed.path.rstrip("/") == "/l":
            targets = parse_qs(parsed.query).get("uddg")
            if targets:
                return targets[0]
    except ValueError:
        # urllib.parse raises ValueError for malformed URLs (e.g. an
        # unbalanced IPv6 bracket); best effort: keep the absolute form.
        pass
    return resolved
def _html_fallback() -> List[dict]:
try:
response = httpx.get(
@@ -315,7 +337,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
link = result.select_one(".result__a")
if not link:
continue
url = link.get("href", "")
url = _resolve_url(link.get("href", ""))
if not url:
continue
snippet_el = result.select_one(".result__snippet")