diff --git a/src/chat_helpers.py b/src/chat_helpers.py index cea9cad08..a8f5f54a8 100644 --- a/src/chat_helpers.py +++ b/src/chat_helpers.py @@ -24,7 +24,14 @@ def extract_urls(text: str) -> List[str]: urls = re.findall(url_pattern, text) cleaned_urls = [] for url in urls: - url = re.sub(r'[.,;:!?\)]+$', '', url) + # Strip trailing sentence punctuation, but keep a balanced ')' so URLs + # that legitimately end in one are preserved, e.g. the Wikipedia link + # ".../Python_(programming_language)". A ')' is only dropped when it is + # unbalanced (more ')' than '('), which is the prose-glued case such as + # "(see https://example.com)". + url = re.sub(r'[.,;:!?]+$', '', url) + while url.endswith(')') and url.count(')') > url.count('('): + url = re.sub(r'[.,;:!?]+$', '', url[:-1]) cleaned_urls.append(url) return cleaned_urls diff --git a/tests/test_extract_urls.py b/tests/test_extract_urls.py new file mode 100644 index 000000000..44351318b --- /dev/null +++ b/tests/test_extract_urls.py @@ -0,0 +1,38 @@ +"""extract_urls must keep a *balanced* trailing ')' while still trimming +prose-glued punctuation. + +The old cleanup stripped any trailing ')' unconditionally, which corrupted URLs +that legitimately end in one (Wikipedia disambiguation links being the common +case). The fix only drops an *unbalanced* ')'. +""" +from src.chat_helpers import extract_urls + + +def test_keeps_balanced_trailing_paren(): + text = "see https://en.wikipedia.org/wiki/Python_(programming_language)" + assert extract_urls(text) == [ + "https://en.wikipedia.org/wiki/Python_(programming_language)" + ] + + +def test_strips_unbalanced_trailing_paren_from_prose(): + # The closing paren belongs to the sentence, not the URL. + assert extract_urls("(see https://example.com)") == ["https://example.com"] + + +def test_strips_trailing_sentence_punctuation(): + assert extract_urls("go to https://example.com.") == ["https://example.com"] + assert extract_urls("https://example.com, then continue") == [ + "https://example.com" + ] + + +def test_strips_trailing_punctuation_after_balanced_close(): + # Keep the balanced ')' but drop the sentence period after it. + text = "ref https://en.wikipedia.org/wiki/Foo_(bar)." + assert extract_urls(text) == ["https://en.wikipedia.org/wiki/Foo_(bar)"] + + +def test_nested_balanced_parens_preserved(): + text = "https://example.com/a_(b_(c))" + assert extract_urls(text) == ["https://example.com/a_(b_(c))"]