mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
fix(chat): keep balanced trailing ')' when extracting URLs (#3406)
extract_urls() stripped any trailing ')' unconditionally via `re.sub(r'[.,;:!?\)]+$', '', url)`. That corrupts URLs that legitimately end in a parenthesis — most commonly Wikipedia disambiguation links like https://en.wikipedia.org/wiki/Python_(programming_language), which became ...Python_(programming_language and then 404 when fetched by the web/research tools. Strip trailing sentence punctuation as before, but only drop a ')' when it is unbalanced (more ')' than '('), so a prose-glued "(see https://example.com)" still loses its closing paren while balanced URLs keep theirs. Added tests/test_extract_urls.py covering balanced, unbalanced, nested, and trailing-punctuation cases.
This commit is contained in:
committed by
GitHub
parent
932b7f2446
commit
8e494cc1c4
+8
-1
@@ -24,7 +24,14 @@ def extract_urls(text: str) -> List[str]:
|
||||
urls = re.findall(url_pattern, text)
|
||||
cleaned_urls = []
|
||||
for url in urls:
|
||||
url = re.sub(r'[.,;:!?\)]+$', '', url)
|
||||
# Strip trailing sentence punctuation, but keep a balanced ')' so URLs
|
||||
# that legitimately end in one are preserved, e.g. the Wikipedia link
|
||||
# ".../Python_(programming_language)". A ')' is only dropped when it is
|
||||
# unbalanced (more ')' than '('), which is the prose-glued case such as
|
||||
# "(see https://example.com)".
|
||||
url = re.sub(r'[.,;:!?]+$', '', url)
|
||||
while url.endswith(')') and url.count(')') > url.count('('):
|
||||
url = re.sub(r'[.,;:!?]+$', '', url[:-1])
|
||||
cleaned_urls.append(url)
|
||||
return cleaned_urls
|
||||
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
"""extract_urls must keep a *balanced* trailing ')' while still trimming
|
||||
prose-glued punctuation.
|
||||
|
||||
The old cleanup stripped any trailing ')' unconditionally, which corrupted URLs
|
||||
that legitimately end in one (Wikipedia disambiguation links being the common
|
||||
case). The fix only drops an *unbalanced* ')'.
|
||||
"""
|
||||
from src.chat_helpers import extract_urls
|
||||
|
||||
|
||||
def test_keeps_balanced_trailing_paren():
|
||||
text = "see https://en.wikipedia.org/wiki/Python_(programming_language)"
|
||||
assert extract_urls(text) == [
|
||||
"https://en.wikipedia.org/wiki/Python_(programming_language)"
|
||||
]
|
||||
|
||||
|
||||
def test_strips_unbalanced_trailing_paren_from_prose():
|
||||
# The closing paren belongs to the sentence, not the URL.
|
||||
assert extract_urls("(see https://example.com)") == ["https://example.com"]
|
||||
|
||||
|
||||
def test_strips_trailing_sentence_punctuation():
|
||||
assert extract_urls("go to https://example.com.") == ["https://example.com"]
|
||||
assert extract_urls("https://example.com, then continue") == [
|
||||
"https://example.com"
|
||||
]
|
||||
|
||||
|
||||
def test_strips_trailing_punctuation_after_balanced_close():
|
||||
# Keep the balanced ')' but drop the sentence period after it.
|
||||
text = "ref https://en.wikipedia.org/wiki/Foo_(bar)."
|
||||
assert extract_urls(text) == ["https://en.wikipedia.org/wiki/Foo_(bar)"]
|
||||
|
||||
|
||||
def test_nested_balanced_parens_preserved():
|
||||
text = "https://example.com/a_(b_(c))"
|
||||
assert extract_urls(text) == ["https://example.com/a_(b_(c))"]
|
||||
Reference in New Issue
Block a user