Add support for scraping from --list ID

2025-04-11 02:40:36 +01:00
parent c7715ae291
commit 6db32bbd20
3 changed files with 34 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -103,6 +103,12 @@ options:                description
                            -ht javascript
                            --hashtag=javascript

+-l, --list              : List ID. Scrape tweets from a list. The
+                          ID is taken from the x.com/i/lists/... URL.
+                          e.g.
+                           -l "1324132413151"
+                           --list "1324132413151"
+
 -q, --query             : Twitter query or search.
                          Scrape tweets from a query or search.
                          e.g.
--- a/scraper/main.py
+++ b/scraper/main.py
@@ -93,6 +93,14 @@ def main():
            help="Set no limit to the number of tweets to scrape (will scrap until no more tweets are available).",
        )

+        parser.add_argument(
+            "-l",
+            "--list",
+            type=str,
+            default=None,
+            help="List ID. Scrape tweets from a list.",
+        )
+
        parser.add_argument(
            "-q",
            "--query",
@@ -145,6 +153,8 @@ def main():
            tweet_type_args.append(args.username)
        if args.hashtag is not None:
            tweet_type_args.append(args.hashtag)
+        if args.list is not None:
+            tweet_type_args.append(args.list)
        if args.query is not None:
            tweet_type_args.append(args.query)
        if args.bookmarks is not False:
@@ -175,6 +185,7 @@ def main():
                scrape_hashtag=args.hashtag,
                scrape_bookmarks=args.bookmarks,
                scrape_query=args.query,
+                scrape_list=args.list,
                scrape_latest=args.latest,
                scrape_top=args.top,
                scrape_poster_details="pd" in additional_data,
--- a/scraper/twitter_scraper.py
+++ b/scraper/twitter_scraper.py
@@ -90,6 +90,7 @@ class Twitter_Scraper:
        scrape_hashtag=None,
        scrape_bookmarks=False,
        scrape_query=None,
+        scrape_list=None,
        scrape_latest=True,
        scrape_top=False,
        scrape_poster_details=False,
@@ -107,6 +108,7 @@ class Twitter_Scraper:
            else None,
            "bookmarks": scrape_bookmarks,
            "query": scrape_query,
+            "list": scrape_list,
            "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
            "poster_details": scrape_poster_details,
        }
@@ -125,6 +127,9 @@ class Twitter_Scraper:
        elif scrape_query is not None:
            self.scraper_details["type"] = "Query"
            self.router = self.go_to_search
+        elif scrape_list is not None:
+            self.scraper_details["type"] = "List"
+            self.router = self.go_to_list
        else:
            self.scraper_details["type"] = "Home"
            self.router = self.go_to_home
@@ -383,6 +388,16 @@ It may be due to the following:
            sleep(3)
        pass

+    def go_to_list(self):
+        """Open the X list page for the configured list ID; exit if unset."""
+        list_id = self.scraper_details["list"]
+        if list_id is None or list_id == "":
+            print("List is not set.")
+            sys.exit(1)
+        url = f"https://x.com/i/lists/{list_id}"
+        self.driver.get(url)
+        sleep(3)
+
    def get_tweet_cards(self):
        self.tweet_cards = self.driver.find_elements(
            "xpath", '//article[@data-testid="tweet" and not(@disabled)]'
@@ -411,6 +426,7 @@ It may be due to the following:
        scrape_hashtag=None,
        scrape_bookmarks=False,
        scrape_query=None,
+        scrape_list=None,
        scrape_latest=True,
        scrape_top=False,
        scrape_poster_details=False,
@@ -422,6 +438,7 @@ It may be due to the following:
            scrape_hashtag,
            scrape_bookmarks,
            scrape_query,
+            scrape_list,
            scrape_latest,
            scrape_top,
            scrape_poster_details,