From 975b5197b895ab08d8bb287cf1457af24c2a1770 Mon Sep 17 00:00:00 2001 From: Jarrian Date: Thu, 28 Sep 2023 11:30:29 +0800 Subject: [PATCH] feat: scrape tweet link, tweet id, and user id --- scraper/tweet.py | 41 +++++++++++++++++++++++++++++++++++++- scraper/twitter_scraper.py | 8 ++++++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/scraper/tweet.py b/scraper/tweet.py index 760c966..d9f5c24 100644 --- a/scraper/tweet.py +++ b/scraper/tweet.py @@ -148,8 +148,19 @@ class Tweet: except NoSuchElementException: self.profile_img = "" + try: + self.tweet_link = self.card.find_element( + "xpath", + ".//a[contains(@href, '/status/')]", + ).get_attribute("href") + self.tweet_id = str(self.tweet_link.split("/")[-1]) + except NoSuchElementException: + self.tweet_link = "" + self.tweet_id = "" + self.following_cnt = "0" self.followers_cnt = "0" + self.user_id = None if scrape_poster_details: el_name = card.find_element( @@ -157,11 +168,17 @@ class Tweet: ) ext_hover_card = False + ext_user_id = False ext_following = False ext_followers = False hover_attempt = 0 - while not ext_hover_card or not ext_following or not ext_followers: + while ( + not ext_hover_card + or not ext_user_id + or not ext_following + or not ext_followers + ): try: actions.move_to_element(el_name).perform() @@ -171,6 +188,25 @@ class Tweet: ext_hover_card = True + while not ext_user_id: + try: + raw_user_id = hover_card.find_element( + "xpath", + '(.//div[contains(@data-testid, "-follow")]) | (.//div[contains(@data-testid, "-unfollow")])', + ).get_attribute("data-testid") + + if raw_user_id == "": + self.user_id = None + else: + self.user_id = str(raw_user_id.split("-")[0]) + + ext_user_id = True + except NoSuchElementException: + continue + except StaleElementReferenceException: + self.error = True + return + while not ext_following: try: self.following_cnt = hover_card.find_element( @@ -231,6 +267,9 @@ class Tweet: self.mentions, self.emojis, self.profile_img, + self.tweet_link, + self.tweet_id, + self.user_id, self.following_cnt, self.followers_cnt, ) diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index b478a05..dc2b63a 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -502,16 +502,20 @@ It may be due to the following: "Mentions": [tweet[10] for tweet in self.data], "Emojis": [tweet[11] for tweet in self.data], "Profile Image": [tweet[12] for tweet in self.data], + "Tweet Link": [tweet[13] for tweet in self.data], + "Tweet ID": [f"tweet_id:{tweet[14]}" for tweet in self.data], } if self.scraper_details["poster_details"]: - data["Following"] = [tweet[13] for tweet in self.data] - data["Followers"] = [tweet[14] for tweet in self.data] + data["Tweeter ID"] = [f"user_id:{tweet[15]}" for tweet in self.data] + data["Following"] = [tweet[16] for tweet in self.data] + data["Followers"] = [tweet[17] for tweet in self.data] df = pd.DataFrame(data) current_time = now.strftime("%Y-%m-%d_%H-%M-%S") file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv" + pd.set_option("display.max_colwidth", None) df.to_csv(file_path, index=False, encoding="utf-8") print("CSV Saved: {}".format(file_path))