mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Merge pull request #809 from BSG-Walter/main
fix: resolve DuckDuckGo redirect URLs in HTML fallback search
This commit is contained in:
@@ -4,6 +4,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
from urllib.parse import urljoin, urlparse, parse_qs
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@@ -299,6 +300,25 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
|
|||||||
|
|
||||||
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
|
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
|
||||||
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
|
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
|
||||||
|
def _resolve_url(raw: str) -> str:
    """Resolve a DuckDuckGo redirect link to the actual destination URL.

    Protocol-relative links (``//host/...``) are given an ``https:`` scheme
    and root-relative links (``/path``) are joined onto
    ``https://html.duckduckgo.com``. If the result is a DuckDuckGo ``/l``
    redirect that carries a ``uddg`` query parameter, the decoded value of
    that parameter is returned instead.

    Args:
        raw: The ``href`` value to resolve.

    Returns:
        The redirect target when one can be extracted, otherwise the
        absolutised link. A falsy ``raw`` is returned unchanged.
    """
    if not raw:
        return raw

    if raw.startswith("//"):
        resolved = "https:" + raw
    elif raw.startswith("/"):
        resolved = urljoin("https://html.duckduckgo.com", raw)
    else:
        resolved = raw

    try:
        parsed = urlparse(resolved)
        host = parsed.hostname or ""
        # Exact host or a real subdomain only: a substring test would also
        # accept look-alikes such as "notduckduckgo.com" or
        # "duckduckgo.com.evil.test" and trust their "uddg" payload.
        is_ddg_host = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
        if is_ddg_host and parsed.path.rstrip("/") == "/l":
            # parse_qs percent-decodes values and omits blank ones, so a
            # present key always has a non-empty first element.
            targets = parse_qs(parsed.query).get("uddg")
            if targets:
                return targets[0]
    except ValueError:
        # Malformed link (e.g. unbalanced IPv6 brackets). Resolution is
        # best-effort, so fall through and return the link unparsed.
        pass
    return resolved
|
||||||
|
|
||||||
def _html_fallback() -> List[dict]:
|
def _html_fallback() -> List[dict]:
|
||||||
try:
|
try:
|
||||||
response = httpx.get(
|
response = httpx.get(
|
||||||
@@ -314,7 +334,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
|
|||||||
link = result.select_one(".result__a")
|
link = result.select_one(".result__a")
|
||||||
if not link:
|
if not link:
|
||||||
continue
|
continue
|
||||||
url = link.get("href", "")
|
url = _resolve_url(link.get("href", ""))
|
||||||
if not url:
|
if not url:
|
||||||
continue
|
continue
|
||||||
snippet_el = result.select_one(".result__snippet")
|
snippet_el = result.select_one(".result__snippet")
|
||||||
|
|||||||
+23
-1
@@ -4,6 +4,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
from urllib.parse import urljoin, urlparse, parse_qs
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@@ -300,6 +301,27 @@ def _brave_search_impl(query: str, count: int, time_filter: Optional[str] = None
|
|||||||
|
|
||||||
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
|
def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
|
||||||
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
|
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
|
||||||
|
def _resolve_url(raw: str) -> str:
    """Resolve a DuckDuckGo redirect link to the actual destination URL.

    Args:
        raw: The ``href`` value to resolve; may be absolute,
            protocol-relative (``//host/...``) or root-relative (``/path``).

    Returns:
        The decoded ``uddg`` target when the link is a DuckDuckGo ``/l``
        redirect, otherwise the absolutised link. A falsy ``raw`` is
        returned unchanged.
    """
    if not raw:
        return raw

    # Handle protocol-relative and root-relative URLs
    if raw.startswith("//"):
        resolved = "https:" + raw
    elif raw.startswith("/"):
        resolved = urljoin("https://html.duckduckgo.com", raw)
    else:
        resolved = raw

    # Extract the actual URL from DuckDuckGo's /l/?uddg= redirect
    try:
        parsed = urlparse(resolved)
        host = parsed.hostname or ""
        # Match the exact host or a true subdomain; a substring check would
        # also accept hosts like "notduckduckgo.com" or
        # "duckduckgo.com.evil.test".
        on_ddg = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
        if on_ddg and parsed.path.rstrip("/") == "/l":
            # parse_qs percent-decodes values and skips blank ones.
            uddg_values = parse_qs(parsed.query).get("uddg")
            if uddg_values:
                return uddg_values[0]
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6
        # brackets); resolution is best-effort, so keep the link as-is.
        pass
    return resolved
|
||||||
|
|
||||||
def _html_fallback() -> List[dict]:
|
def _html_fallback() -> List[dict]:
|
||||||
try:
|
try:
|
||||||
response = httpx.get(
|
response = httpx.get(
|
||||||
@@ -315,7 +337,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
|
|||||||
link = result.select_one(".result__a")
|
link = result.select_one(".result__a")
|
||||||
if not link:
|
if not link:
|
||||||
continue
|
continue
|
||||||
url = link.get("href", "")
|
url = _resolve_url(link.get("href", ""))
|
||||||
if not url:
|
if not url:
|
||||||
continue
|
continue
|
||||||
snippet_el = result.select_one(".result__snippet")
|
snippet_el = result.select_one(".result__snippet")
|
||||||
|
|||||||
Reference in New Issue
Block a user