From 6db32bbd200ceb47401a85b7bb066a73a0485fa1 Mon Sep 17 00:00:00 2001 From: occasionallydavid <62972099+occasionallydavid@users.noreply.github.com> Date: Fri, 11 Apr 2025 02:40:36 +0100 Subject: [PATCH] Add support for scraping from --list ID --- README.md | 6 ++++++ scraper/__main__.py | 11 +++++++++++ scraper/twitter_scraper.py | 17 +++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/README.md b/README.md index 8d6c068..f32ca95 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,12 @@ options: description -ht javascript --hashtag=javascript +-l, --list : List ID. Scrape tweets from a list. The + ID is taken from the x.com/i/lists/... URL. + e.g. + -l "1324132413151" + --list "1324132413151" + -q, --query : Twitter query or search. Scrape tweets from a query or search. e.g. diff --git a/scraper/__main__.py b/scraper/__main__.py index c572467..89024cc 100644 --- a/scraper/__main__.py +++ b/scraper/__main__.py @@ -93,6 +93,14 @@ def main(): help="Set no limit to the number of tweets to scrape (will scrap until no more tweets are available).", ) + parser.add_argument( + "-l", + "--list", + type=str, + default=None, + help="List ID. 
Scrape tweets from a list.", + ) + parser.add_argument( "-q", "--query", @@ -145,6 +153,8 @@ def main(): tweet_type_args.append(args.username) if args.hashtag is not None: tweet_type_args.append(args.hashtag) + if args.list is not None: + tweet_type_args.append(args.list) if args.query is not None: tweet_type_args.append(args.query) if args.bookmarks is not False: @@ -175,6 +185,7 @@ def main(): scrape_hashtag=args.hashtag, scrape_bookmarks=args.bookmarks, scrape_query=args.query, + scrape_list=args.list, scrape_latest=args.latest, scrape_top=args.top, scrape_poster_details="pd" in additional_data, diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index 6595bc4..fd34e78 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -90,6 +90,7 @@ class Twitter_Scraper: scrape_hashtag=None, scrape_bookmarks=False, scrape_query=None, + scrape_list=None, scrape_latest=True, scrape_top=False, scrape_poster_details=False, @@ -107,6 +108,7 @@ class Twitter_Scraper: else None, "bookmarks": scrape_bookmarks, "query": scrape_query, + "list": scrape_list, "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest", "poster_details": scrape_poster_details, } @@ -125,6 +127,9 @@ class Twitter_Scraper: elif scrape_query is not None: self.scraper_details["type"] = "Query" self.router = self.go_to_search + elif scrape_list is not None: + self.scraper_details["type"] = "List" + self.router = self.go_to_list else: self.scraper_details["type"] = "Home" self.router = self.go_to_home @@ -383,6 +388,16 @@ It may be due to the following: sleep(3) pass + def go_to_list(self): + if self.scraper_details["list"] is None or self.scraper_details["list"] == "": + print("List is not set.") + sys.exit(1) + else: + url = f"https://x.com/i/lists/{self.scraper_details['list']}" + self.driver.get(url) + sleep(3) + pass + def get_tweet_cards(self): self.tweet_cards = self.driver.find_elements( "xpath", '//article[@data-testid="tweet" and not(@disabled)]' @@ 
-411,6 +426,7 @@ It may be due to the following: scrape_hashtag=None, scrape_bookmarks=False, scrape_query=None, + scrape_list=None, scrape_latest=True, scrape_top=False, scrape_poster_details=False, @@ -422,6 +438,7 @@ It may be due to the following: scrape_hashtag, scrape_bookmarks, scrape_query, + scrape_list, scrape_latest, scrape_top, scrape_poster_details,