fix(youtube): consolidate duplicate handler

Make src.youtube_handler a compatibility wrapper around services.youtube.youtube_handler so transcript state, URL parsing, and timeout behavior no longer diverge.
2026-06-15 17:25:26 -04:00 · 2026-06-15 08:03:41 +02:00
parent 96052c5e8a
commit 4c41834dc7
3 changed files with 168 additions and 288 deletions
@@ -64,20 +64,40 @@ def is_youtube_url(url: str) -> bool:
    return "youtube.com" in url or "youtu.be" in url


+# youtube.com-shaped hosts. music.youtube.com serves the same /watch and
+# /shorts paths, so links shared from YouTube Music must resolve too.
+_YT_HOSTS = ("www.youtube.com", "youtube.com", "m.youtube.com", "music.youtube.com")
+# Path prefixes whose first following segment is the video id. Covers the
+# /embed/ player, Shorts (/shorts/), live streams (/live/), and the legacy
+# /v/ embed — all of which `is_youtube_url` already treats as YouTube, so
+# they must be extractable or the link is silently dropped (neither web-fetched
+# nor transcript-fetched) by the chat pipeline.
+_YT_PATH_PREFIXES = ("/embed/", "/shorts/", "/live/", "/v/")
+
+
 def extract_youtube_id(url: str) -> Optional[str]:
-    """Extract YouTube video ID from various URL formats."""
+    """Extract a YouTube video ID from the common URL shapes:
+    watch?v=, youtu.be/<id>, /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>,
+    across youtube.com / m.youtube.com / music.youtube.com / youtu.be."""
    if not isinstance(url, str):
        return None
    parsed = urllib.parse.urlparse(url)
-    if parsed.hostname in ("www.youtube.com", "youtube.com", "m.youtube.com"):
+    host = (parsed.hostname or "").lower()
+    if host in _YT_HOSTS:
        if parsed.path == "/watch":
            params = urllib.parse.parse_qs(parsed.query)
-            if "v" in params:
+            if params.get("v"):
                return params["v"][0]
-        elif parsed.path.startswith("/embed/"):
-            return parsed.path.split("/")[-1]
-    elif parsed.hostname == "youtu.be":
-        return parsed.path[1:]
+        else:
+            for prefix in _YT_PATH_PREFIXES:
+                if parsed.path.startswith(prefix):
+                    vid = parsed.path[len(prefix):].split("/")[0]
+                    if vid:
+                        return vid
+    elif host == "youtu.be":
+        vid = parsed.path.lstrip("/").split("/")[0]
+        if vid:
+            return vid
    return None


@@ -170,6 +190,8 @@ def format_transcript_for_context(
    if segments:
        ctx += "Timestamped Transcript:\n"
        for seg in segments:
+            if not isinstance(seg, dict):
+                continue
            ctx += f"[{seg['timestamp']}] {seg['text']}\n"
        # Check length — fall back to plain text if too long
        if len(ctx) > 12000:
@@ -202,15 +224,24 @@ async def fetch_youtube_comments(
            f"https://www.youtube.com/watch?v={video_id}",
        ]

-        proc = await asyncio.wait_for(
-            asyncio.create_subprocess_exec(
-                *cmd,
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
-            ),
-            timeout=timeout,
+        proc = await asyncio.create_subprocess_exec(
+            *cmd,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
        )
-        stdout, stderr = await proc.communicate()
+        # Bound the wait on the process actually finishing, not on spawning it.
+        # create_subprocess_exec returns as soon as the child starts, so wrapping
+        # it in wait_for never enforces the timeout — proc.communicate() is the
+        # blocking step. Kill and reap the child if it overruns so it does not
+        # linger after we return.
+        try:
+            stdout, stderr = await asyncio.wait_for(
+                proc.communicate(), timeout=timeout
+            )
+        except asyncio.TimeoutError:
+            proc.kill()
+            await proc.wait()
+            raise

        if proc.returncode != 0:
            return {"success": False, "error": f"yt-dlp failed: {stderr.decode()[:200]}", "comments": []}