feat: scrape mentions and emojis
This commit is contained in:
67
main.ipynb
67
main.ipynb
@@ -17,7 +17,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 140,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -55,7 +55,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 141,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -95,7 +95,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 142,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -139,7 +139,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": 143,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -219,13 +219,6 @@
|
|||||||
" self.analytics_cnt = \"0\"\n",
|
" self.analytics_cnt = \"0\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" self.profile_img = card.find_element(\n",
|
|
||||||
" \"xpath\", './/div[@data-testid=\"Tweet-User-Avatar\"]//img'\n",
|
|
||||||
" ).get_attribute(\"src\")\n",
|
|
||||||
" except NoSuchElementException:\n",
|
|
||||||
" self.profile_img = \"\"\n",
|
|
||||||
"\n",
|
|
||||||
" try:\n",
|
|
||||||
" self.tags = card.find_elements(\n",
|
" self.tags = card.find_elements(\n",
|
||||||
" \"xpath\",\n",
|
" \"xpath\",\n",
|
||||||
" './/a[contains(@href, \"src=hashtag_click\")]',\n",
|
" './/a[contains(@href, \"src=hashtag_click\")]',\n",
|
||||||
@@ -235,6 +228,34 @@
|
|||||||
" except NoSuchElementException:\n",
|
" except NoSuchElementException:\n",
|
||||||
" self.tags = []\n",
|
" self.tags = []\n",
|
||||||
" \n",
|
" \n",
|
||||||
|
" try:\n",
|
||||||
|
" self.mentions = card.find_elements(\n",
|
||||||
|
" \"xpath\",\n",
|
||||||
|
" '(.//div[@data-testid=\"tweetText\"])[1]//a[contains(text(), \"@\")]',\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" self.mentions = [mention.text for mention in self.mentions]\n",
|
||||||
|
" except NoSuchElementException:\n",
|
||||||
|
" self.mentions = []\n",
|
||||||
|
" \n",
|
||||||
|
" try:\n",
|
||||||
|
" raw_emojis = card.find_elements(\n",
|
||||||
|
" \"xpath\",\n",
|
||||||
|
" '(.//div[@data-testid=\"tweetText\"])[1]/img[contains(@src, \"emoji\")]',\n",
|
||||||
|
" )\n",
|
||||||
|
" \n",
|
||||||
|
" # self.emojis = [emoji.get_attribute(\"alt\").encode(\"utf-8\") for emoji in raw_emojis]\n",
|
||||||
|
" self.emojis = [emoji.get_attribute(\"alt\").encode(\"unicode-escape\").decode(\"ASCII\") for emoji in raw_emojis]\n",
|
||||||
|
" except NoSuchElementException:\n",
|
||||||
|
" self.emojis = []\n",
|
||||||
|
" \n",
|
||||||
|
" try:\n",
|
||||||
|
" self.profile_img = card.find_element(\n",
|
||||||
|
" \"xpath\", './/div[@data-testid=\"Tweet-User-Avatar\"]//img'\n",
|
||||||
|
" ).get_attribute(\"src\")\n",
|
||||||
|
" except NoSuchElementException:\n",
|
||||||
|
" self.profile_img = \"\"\n",
|
||||||
|
"\n",
|
||||||
" self.tweet = (\n",
|
" self.tweet = (\n",
|
||||||
" self.user,\n",
|
" self.user,\n",
|
||||||
" self.handle,\n",
|
" self.handle,\n",
|
||||||
@@ -246,6 +267,8 @@
|
|||||||
" self.like_cnt,\n",
|
" self.like_cnt,\n",
|
||||||
" self.analytics_cnt,\n",
|
" self.analytics_cnt,\n",
|
||||||
" self.tags,\n",
|
" self.tags,\n",
|
||||||
|
" self.mentions,\n",
|
||||||
|
" self.emojis,\n",
|
||||||
" self.profile_img,\n",
|
" self.profile_img,\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -264,7 +287,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 144,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -334,6 +357,7 @@
|
|||||||
" \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n",
|
" \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n",
|
||||||
" }\n",
|
" }\n",
|
||||||
" self.router = self.go_to_home\n",
|
" self.router = self.go_to_home\n",
|
||||||
|
" self.scroller = Scroller(self.driver)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" if scrape_username is not None:\n",
|
" if scrape_username is not None:\n",
|
||||||
" self.scraper_details[\"type\"] = \"Username\"\n",
|
" self.scraper_details[\"type\"] = \"Username\"\n",
|
||||||
@@ -708,14 +732,16 @@
|
|||||||
" \"Likes\": [tweet[7] for tweet in self.data],\n",
|
" \"Likes\": [tweet[7] for tweet in self.data],\n",
|
||||||
" \"Analytics\": [tweet[8] for tweet in self.data],\n",
|
" \"Analytics\": [tweet[8] for tweet in self.data],\n",
|
||||||
" \"Tags\": [tweet[9] for tweet in self.data],\n",
|
" \"Tags\": [tweet[9] for tweet in self.data],\n",
|
||||||
" \"Profile Image\": [tweet[10] for tweet in self.data],\n",
|
" \"Mentions\": [tweet[10] for tweet in self.data],\n",
|
||||||
|
" \"Emojis\": [tweet[11] for tweet in self.data],\n",
|
||||||
|
" \"Profile Image\": [tweet[12] for tweet in self.data],\n",
|
||||||
" }\n",
|
" }\n",
|
||||||
"\n",
|
"\n",
|
||||||
" df = pd.DataFrame(data)\n",
|
" df = pd.DataFrame(data)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
|
" current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
|
||||||
" file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n",
|
" file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n",
|
||||||
" df.to_csv(file_path, index=False)\n",
|
" df.to_csv(file_path, index=False, encoding=\"utf-8\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" print(\"CSV Saved: {}\".format(file_path))\n",
|
" print(\"CSV Saved: {}\".format(file_path))\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -735,7 +761,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 27,
|
"execution_count": 145,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -760,7 +786,7 @@
|
|||||||
" username=USER_UNAME,\n",
|
" username=USER_UNAME,\n",
|
||||||
" password=USER_PASSWORD,\n",
|
" password=USER_PASSWORD,\n",
|
||||||
" # max_tweets=10,\n",
|
" # max_tweets=10,\n",
|
||||||
" # scrape_username=\"\",\n",
|
" # scrape_username=\"something\",\n",
|
||||||
" # scrape_hashtag=\"something\",\n",
|
" # scrape_hashtag=\"something\",\n",
|
||||||
" # scrape_query=\"something\",\n",
|
" # scrape_query=\"something\",\n",
|
||||||
" # scrape_latest=True,\n",
|
" # scrape_latest=True,\n",
|
||||||
@@ -778,7 +804,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 28,
|
"execution_count": 146,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -798,6 +824,7 @@
|
|||||||
" # max_tweets=10,\n",
|
" # max_tweets=10,\n",
|
||||||
" # scrape_username=\"something\",\n",
|
" # scrape_username=\"something\",\n",
|
||||||
" # scrape_hashtag=\"something\",\n",
|
" # scrape_hashtag=\"something\",\n",
|
||||||
|
" # scrape_hashtag=\"something\",\n",
|
||||||
" # scrape_query=\"something\",\n",
|
" # scrape_query=\"something\",\n",
|
||||||
" # scrape_latest=True,\n",
|
" # scrape_latest=True,\n",
|
||||||
" # scrape_top=False,\n",
|
" # scrape_top=False,\n",
|
||||||
@@ -814,7 +841,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 29,
|
"execution_count": 147,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@@ -822,7 +849,7 @@
|
|||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Saving Tweets to CSV...\n",
|
"Saving Tweets to CSV...\n",
|
||||||
"CSV Saved: ./tweets/2023-09-20_09-26-30_tweets_1-50.csv\n"
|
"CSV Saved: ./tweets/2023-09-23_09-54-41_tweets_1-50.csv\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -832,7 +859,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 148,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
|||||||
@@ -77,13 +77,6 @@ class Tweet:
|
|||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
self.analytics_cnt = "0"
|
self.analytics_cnt = "0"
|
||||||
|
|
||||||
try:
|
|
||||||
self.profile_img = card.find_element(
|
|
||||||
"xpath", './/div[@data-testid="Tweet-User-Avatar"]//img'
|
|
||||||
).get_attribute("src")
|
|
||||||
except NoSuchElementException:
|
|
||||||
self.profile_img = ""
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.tags = card.find_elements(
|
self.tags = card.find_elements(
|
||||||
"xpath",
|
"xpath",
|
||||||
@@ -94,6 +87,36 @@ class Tweet:
|
|||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
self.tags = []
|
self.tags = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.mentions = card.find_elements(
|
||||||
|
"xpath",
|
||||||
|
'(.//div[@data-testid="tweetText"])[1]//a[contains(text(), "@")]',
|
||||||
|
)
|
||||||
|
|
||||||
|
self.mentions = [mention.text for mention in self.mentions]
|
||||||
|
except NoSuchElementException:
|
||||||
|
self.mentions = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
raw_emojis = card.find_elements(
|
||||||
|
"xpath",
|
||||||
|
'(.//div[@data-testid="tweetText"])[1]/img[contains(@src, "emoji")]',
|
||||||
|
)
|
||||||
|
|
||||||
|
self.emojis = [
|
||||||
|
emoji.get_attribute("alt").encode("unicode-escape").decode("ASCII")
|
||||||
|
for emoji in raw_emojis
|
||||||
|
]
|
||||||
|
except NoSuchElementException:
|
||||||
|
self.emojis = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.profile_img = card.find_element(
|
||||||
|
"xpath", './/div[@data-testid="Tweet-User-Avatar"]//img'
|
||||||
|
).get_attribute("src")
|
||||||
|
except NoSuchElementException:
|
||||||
|
self.profile_img = ""
|
||||||
|
|
||||||
self.tweet = (
|
self.tweet = (
|
||||||
self.user,
|
self.user,
|
||||||
self.handle,
|
self.handle,
|
||||||
@@ -105,6 +128,8 @@ class Tweet:
|
|||||||
self.like_cnt,
|
self.like_cnt,
|
||||||
self.analytics_cnt,
|
self.analytics_cnt,
|
||||||
self.tags,
|
self.tags,
|
||||||
|
self.mentions,
|
||||||
|
self.emojis,
|
||||||
self.profile_img,
|
self.profile_img,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,7 @@ class Twitter_Scraper:
|
|||||||
"tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
|
"tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
|
||||||
}
|
}
|
||||||
self.router = self.go_to_home
|
self.router = self.go_to_home
|
||||||
|
self.scroller = Scroller(self.driver)
|
||||||
|
|
||||||
if scrape_username is not None:
|
if scrape_username is not None:
|
||||||
self.scraper_details["type"] = "Username"
|
self.scraper_details["type"] = "Username"
|
||||||
@@ -462,14 +463,16 @@ It may be due to the following:
|
|||||||
"Likes": [tweet[7] for tweet in self.data],
|
"Likes": [tweet[7] for tweet in self.data],
|
||||||
"Analytics": [tweet[8] for tweet in self.data],
|
"Analytics": [tweet[8] for tweet in self.data],
|
||||||
"Tags": [tweet[9] for tweet in self.data],
|
"Tags": [tweet[9] for tweet in self.data],
|
||||||
"Profile Image": [tweet[10] for tweet in self.data],
|
"Mentions": [tweet[10] for tweet in self.data],
|
||||||
|
"Emojis": [tweet[11] for tweet in self.data],
|
||||||
|
"Profile Image": [tweet[12] for tweet in self.data],
|
||||||
}
|
}
|
||||||
|
|
||||||
df = pd.DataFrame(data)
|
df = pd.DataFrame(data)
|
||||||
|
|
||||||
current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
|
current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
|
||||||
file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
|
file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
|
||||||
df.to_csv(file_path, index=False)
|
df.to_csv(file_path, index=False, encoding="utf-8")
|
||||||
|
|
||||||
print("CSV Saved: {}".format(file_path))
|
print("CSV Saved: {}".format(file_path))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user