From 8e494cc1c4b238799acb2d27a328817d6f44dba8 Mon Sep 17 00:00:00 2001
From: Mazen Tamer Salah <78306991+mazen-salah@users.noreply.github.com>
Date: Mon, 8 Jun 2026 22:33:29 +0300
Subject: [PATCH] fix(chat): keep balanced trailing ')' when extracting URLs
 (#3406)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

extract_urls() stripped any trailing ')' unconditionally via
`re.sub(r'[.,;:!?\)]+$', '', url)`. That corrupts URLs that legitimately
end in a parenthesis — most commonly Wikipedia disambiguation links like
https://en.wikipedia.org/wiki/Python_(programming_language), which became
...Python_(programming_language and then 404 when fetched by the web/research
tools.

Strip trailing sentence punctuation as before, but only drop a ')' when it is
unbalanced (more ')' than '('), so a prose-glued "(see https://example.com)"
still loses its closing paren while balanced URLs keep theirs.

Added tests/test_extract_urls.py covering balanced, unbalanced, nested, and
trailing-punctuation cases.
---
 src/chat_helpers.py        |  9 ++++++++-
 tests/test_extract_urls.py | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_extract_urls.py

diff --git a/src/chat_helpers.py b/src/chat_helpers.py
index cea9cad08..a8f5f54a8 100644
--- a/src/chat_helpers.py
+++ b/src/chat_helpers.py
@@ -24,7 +24,14 @@ def extract_urls(text: str) -> List[str]:
     urls = re.findall(url_pattern, text)
     cleaned_urls = []
     for url in urls:
-        url = re.sub(r'[.,;:!?\)]+$', '', url)
+        # Strip trailing sentence punctuation, but keep a balanced ')' so URLs
+        # that legitimately end in one are preserved, e.g. the Wikipedia link
+        # ".../Python_(programming_language)". A ')' is only dropped when it is
+        # unbalanced (more ')' than '('), which is the prose-glued case such as
+        # "(see https://example.com)".
+        url = re.sub(r'[.,;:!?]+$', '', url)
+        while url.endswith(')') and url.count(')') > url.count('('):
+            url = re.sub(r'[.,;:!?]+$', '', url[:-1])
         cleaned_urls.append(url)
     return cleaned_urls
 
diff --git a/tests/test_extract_urls.py b/tests/test_extract_urls.py
new file mode 100644
index 000000000..44351318b
--- /dev/null
+++ b/tests/test_extract_urls.py
@@ -0,0 +1,38 @@
+"""extract_urls must keep a *balanced* trailing ')' while still trimming
+prose-glued punctuation.
+
+The old cleanup stripped any trailing ')' unconditionally, which corrupted URLs
+that legitimately end in one (Wikipedia disambiguation links being the common
+case). The fix only drops an *unbalanced* ')'.
+"""
+from src.chat_helpers import extract_urls
+
+
+def test_keeps_balanced_trailing_paren():
+    text = "see https://en.wikipedia.org/wiki/Python_(programming_language)"
+    assert extract_urls(text) == [
+        "https://en.wikipedia.org/wiki/Python_(programming_language)"
+    ]
+
+
+def test_strips_unbalanced_trailing_paren_from_prose():
+    # The closing paren belongs to the sentence, not the URL.
+    assert extract_urls("(see https://example.com)") == ["https://example.com"]
+
+
+def test_strips_trailing_sentence_punctuation():
+    assert extract_urls("go to https://example.com.") == ["https://example.com"]
+    assert extract_urls("https://example.com, then continue") == [
+        "https://example.com"
+    ]
+
+
+def test_strips_trailing_punctuation_after_balanced_close():
+    # Keep the balanced ')' but drop the sentence period after it.
+    text = "ref https://en.wikipedia.org/wiki/Foo_(bar)."
+    assert extract_urls(text) == ["https://en.wikipedia.org/wiki/Foo_(bar)"]
+
+
+def test_nested_balanced_parens_preserved():
+    text = "https://example.com/a_(b_(c))"
+    assert extract_urls(text) == ["https://example.com/a_(b_(c))"]