From 8e494cc1c4b238799acb2d27a328817d6f44dba8 Mon Sep 17 00:00:00 2001 From: Mazen Tamer Salah <78306991+mazen-salah@users.noreply.github.com> Date: Mon, 8 Jun 2026 22:33:29 +0300 Subject: [PATCH] fix(chat): keep balanced trailing ')' when extracting URLs (#3406) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extract_urls() stripped any trailing ')' unconditionally via `re.sub(r'[.,;:!?\)]+$', '', url)`. That corrupts URLs that legitimately end in a parenthesis — most commonly Wikipedia disambiguation links like https://en.wikipedia.org/wiki/Python_(programming_language), which became ...Python_(programming_language and then 404 when fetched by the web/research tools. Strip trailing sentence punctuation as before, but only drop a ')' when it is unbalanced (more ')' than '('), so a prose-glued "(see https://example.com)" still loses its closing paren while balanced URLs keep theirs. Added tests/test_extract_urls.py covering balanced, unbalanced, nested, and trailing-punctuation cases. --- src/chat_helpers.py | 9 ++++++++- tests/test_extract_urls.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 tests/test_extract_urls.py diff --git a/src/chat_helpers.py b/src/chat_helpers.py index cea9cad08..a8f5f54a8 100644 --- a/src/chat_helpers.py +++ b/src/chat_helpers.py @@ -24,7 +24,14 @@ def extract_urls(text: str) -> List[str]: urls = re.findall(url_pattern, text) cleaned_urls = [] for url in urls: - url = re.sub(r'[.,;:!?\)]+$', '', url) + # Strip trailing sentence punctuation, but keep a balanced ')' so URLs + # that legitimately end in one are preserved, e.g. the Wikipedia link + # ".../Python_(programming_language)". A ')' is only dropped when it is + # unbalanced (more ')' than '('), which is the prose-glued case such as + # "(see https://example.com)". + url = re.sub(r'[.,;:!?]+$', '', url) + while url.endswith(')') and url.count(')') > url.count('('): + url = re.sub(r'[.,;:!?]+$', '', url[:-1]) cleaned_urls.append(url) return cleaned_urls diff --git a/tests/test_extract_urls.py b/tests/test_extract_urls.py new file mode 100644 index 000000000..44351318b --- /dev/null +++ b/tests/test_extract_urls.py @@ -0,0 +1,38 @@ +"""extract_urls must keep a *balanced* trailing ')' while still trimming +prose-glued punctuation. + +The old cleanup stripped any trailing ')' unconditionally, which corrupted URLs +that legitimately end in one (Wikipedia disambiguation links being the common +case). The fix only drops an *unbalanced* ')'. +""" +from src.chat_helpers import extract_urls + + +def test_keeps_balanced_trailing_paren(): + text = "see https://en.wikipedia.org/wiki/Python_(programming_language)" + assert extract_urls(text) == [ + "https://en.wikipedia.org/wiki/Python_(programming_language)" + ] + + +def test_strips_unbalanced_trailing_paren_from_prose(): + # The closing paren belongs to the sentence, not the URL. + assert extract_urls("(see https://example.com)") == ["https://example.com"] + + +def test_strips_trailing_sentence_punctuation(): + assert extract_urls("go to https://example.com.") == ["https://example.com"] + assert extract_urls("https://example.com, then continue") == [ + "https://example.com" + ] + + +def test_strips_trailing_punctuation_after_balanced_close(): + # Keep the balanced ')' but drop the sentence period after it. + text = "ref https://en.wikipedia.org/wiki/Foo_(bar)." + assert extract_urls(text) == ["https://en.wikipedia.org/wiki/Foo_(bar)"] + + +def test_nested_balanced_parens_preserved(): + text = "https://example.com/a_(b_(c))" + assert extract_urls(text) == ["https://example.com/a_(b_(c))"]