feat: scrape tweet link, tweet id, and user id
This commit is contained in:
@@ -148,8 +148,19 @@ class Tweet:
|
|||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
self.profile_img = ""
|
self.profile_img = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.tweet_link = self.card.find_element(
|
||||||
|
"xpath",
|
||||||
|
".//a[contains(@href, '/status/')]",
|
||||||
|
).get_attribute("href")
|
||||||
|
self.tweet_id = str(self.tweet_link.split("/")[-1])
|
||||||
|
except NoSuchElementException:
|
||||||
|
self.tweet_link = ""
|
||||||
|
self.tweet_id = ""
|
||||||
|
|
||||||
self.following_cnt = "0"
|
self.following_cnt = "0"
|
||||||
self.followers_cnt = "0"
|
self.followers_cnt = "0"
|
||||||
|
self.user_id = None
|
||||||
|
|
||||||
if scrape_poster_details:
|
if scrape_poster_details:
|
||||||
el_name = card.find_element(
|
el_name = card.find_element(
|
||||||
@@ -157,11 +168,17 @@ class Tweet:
|
|||||||
)
|
)
|
||||||
|
|
||||||
ext_hover_card = False
|
ext_hover_card = False
|
||||||
|
ext_user_id = False
|
||||||
ext_following = False
|
ext_following = False
|
||||||
ext_followers = False
|
ext_followers = False
|
||||||
hover_attempt = 0
|
hover_attempt = 0
|
||||||
|
|
||||||
while not ext_hover_card or not ext_following or not ext_followers:
|
while (
|
||||||
|
not ext_hover_card
|
||||||
|
or not ext_user_id
|
||||||
|
or not ext_following
|
||||||
|
or not ext_followers
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
actions.move_to_element(el_name).perform()
|
actions.move_to_element(el_name).perform()
|
||||||
|
|
||||||
@@ -171,6 +188,25 @@ class Tweet:
|
|||||||
|
|
||||||
ext_hover_card = True
|
ext_hover_card = True
|
||||||
|
|
||||||
|
while not ext_user_id:
|
||||||
|
try:
|
||||||
|
raw_user_id = hover_card.find_element(
|
||||||
|
"xpath",
|
||||||
|
'(.//div[contains(@data-testid, "-follow")]) | (.//div[contains(@data-testid, "-unfollow")])',
|
||||||
|
).get_attribute("data-testid")
|
||||||
|
|
||||||
|
if raw_user_id == "":
|
||||||
|
self.user_id = None
|
||||||
|
else:
|
||||||
|
self.user_id = str(raw_user_id.split("-")[0])
|
||||||
|
|
||||||
|
ext_user_id = True
|
||||||
|
except NoSuchElementException:
|
||||||
|
continue
|
||||||
|
except StaleElementReferenceException:
|
||||||
|
self.error = True
|
||||||
|
return
|
||||||
|
|
||||||
while not ext_following:
|
while not ext_following:
|
||||||
try:
|
try:
|
||||||
self.following_cnt = hover_card.find_element(
|
self.following_cnt = hover_card.find_element(
|
||||||
@@ -231,6 +267,9 @@ class Tweet:
|
|||||||
self.mentions,
|
self.mentions,
|
||||||
self.emojis,
|
self.emojis,
|
||||||
self.profile_img,
|
self.profile_img,
|
||||||
|
self.tweet_link,
|
||||||
|
self.tweet_id,
|
||||||
|
self.user_id,
|
||||||
self.following_cnt,
|
self.following_cnt,
|
||||||
self.followers_cnt,
|
self.followers_cnt,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -502,16 +502,20 @@ It may be due to the following:
|
|||||||
"Mentions": [tweet[10] for tweet in self.data],
|
"Mentions": [tweet[10] for tweet in self.data],
|
||||||
"Emojis": [tweet[11] for tweet in self.data],
|
"Emojis": [tweet[11] for tweet in self.data],
|
||||||
"Profile Image": [tweet[12] for tweet in self.data],
|
"Profile Image": [tweet[12] for tweet in self.data],
|
||||||
|
"Tweet Link": [tweet[13] for tweet in self.data],
|
||||||
|
"Tweet ID": [f"tweet_id:{tweet[14]}" for tweet in self.data],
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.scraper_details["poster_details"]:
|
if self.scraper_details["poster_details"]:
|
||||||
data["Following"] = [tweet[13] for tweet in self.data]
|
data["Tweeter ID"] = [f"user_id:{tweet[15]}" for tweet in self.data]
|
||||||
data["Followers"] = [tweet[14] for tweet in self.data]
|
data["Following"] = [tweet[16] for tweet in self.data]
|
||||||
|
data["Followers"] = [tweet[17] for tweet in self.data]
|
||||||
|
|
||||||
df = pd.DataFrame(data)
|
df = pd.DataFrame(data)
|
||||||
|
|
||||||
current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
|
current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
|
||||||
file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
|
file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
|
||||||
|
pd.set_option("display.max_colwidth", None)
|
||||||
df.to_csv(file_path, index=False, encoding="utf-8")
|
df.to_csv(file_path, index=False, encoding="utf-8")
|
||||||
|
|
||||||
print("CSV Saved: {}".format(file_path))
|
print("CSV Saved: {}".format(file_path))
|
||||||
|
|||||||
Reference in New Issue
Block a user