From 407a7174383a3c8930fd84e1ee48de64ff35f99e Mon Sep 17 00:00:00 2001 From: Jarrian Date: Sat, 23 Sep 2023 10:00:17 +0800 Subject: [PATCH] feat: scrape mentions and emojis --- main.ipynb | 67 ++++++++++++++++++++++++++------------ scraper/tweet.py | 39 ++++++++++++++++++---- scraper/twitter_scraper.py | 7 ++-- 3 files changed, 84 insertions(+), 29 deletions(-) diff --git a/main.ipynb b/main.ipynb index b3fc24d..5f45a51 100644 --- a/main.ipynb +++ b/main.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 140, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 142, "metadata": {}, "outputs": [], "source": [ @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 143, "metadata": {}, "outputs": [], "source": [ @@ -219,13 +219,6 @@ " self.analytics_cnt = \"0\"\n", "\n", " try:\n", - " self.profile_img = card.find_element(\n", - " \"xpath\", './/div[@data-testid=\"Tweet-User-Avatar\"]//img'\n", - " ).get_attribute(\"src\")\n", - " except NoSuchElementException:\n", - " self.profile_img = \"\"\n", - "\n", - " try:\n", " self.tags = card.find_elements(\n", " \"xpath\",\n", " './/a[contains(@href, \"src=hashtag_click\")]',\n", @@ -234,6 +227,34 @@ " self.tags = [tag.text for tag in self.tags]\n", " except NoSuchElementException:\n", " self.tags = []\n", + " \n", + " try:\n", + " self.mentions = card.find_elements(\n", + " \"xpath\",\n", + " '(.//div[@data-testid=\"tweetText\"])[1]//a[contains(text(), \"@\")]',\n", + " )\n", + "\n", + " self.mentions = [mention.text for mention in self.mentions]\n", + " except NoSuchElementException:\n", + " self.mentions = []\n", + " \n", + " try:\n", + " raw_emojis = card.find_elements(\n", + " \"xpath\",\n", + " '(.//div[@data-testid=\"tweetText\"])[1]/img[contains(@src, \"emoji\")]',\n", + " )\n", + " \n", + " # self.emojis = [emoji.get_attribute(\"alt\").encode(\"utf-8\") for emoji in raw_emojis]\n", + " self.emojis = [emoji.get_attribute(\"alt\").encode(\"unicode-escape\").decode(\"ASCII\") for emoji in raw_emojis]\n", + " except NoSuchElementException:\n", + " self.emojis = []\n", + " \n", + " try:\n", + " self.profile_img = card.find_element(\n", + " \"xpath\", './/div[@data-testid=\"Tweet-User-Avatar\"]//img'\n", + " ).get_attribute(\"src\")\n", + " except NoSuchElementException:\n", + " self.profile_img = \"\"\n", "\n", " self.tweet = (\n", " self.user,\n", @@ -246,6 +267,8 @@ " self.like_cnt,\n", " self.analytics_cnt,\n", " self.tags,\n", + " self.mentions,\n", + " self.emojis,\n", " self.profile_img,\n", " )\n", "\n", @@ -264,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ @@ -334,6 +357,7 @@ " \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n", " }\n", " self.router = self.go_to_home\n", + " self.scroller = Scroller(self.driver)\n", "\n", " if scrape_username is not None:\n", " self.scraper_details[\"type\"] = \"Username\"\n", @@ -708,14 +732,16 @@ " \"Likes\": [tweet[7] for tweet in self.data],\n", " \"Analytics\": [tweet[8] for tweet in self.data],\n", " \"Tags\": [tweet[9] for tweet in self.data],\n", - " \"Profile Image\": [tweet[10] for tweet in self.data],\n", + " \"Mentions\": [tweet[10] for tweet in self.data],\n", + " \"Emojis\": [tweet[11] for tweet in self.data],\n", + " \"Profile Image\": [tweet[12] for tweet in self.data],\n", " }\n", "\n", " df = pd.DataFrame(data)\n", "\n", " current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n", " file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n", - " df.to_csv(file_path, index=False)\n", + " df.to_csv(file_path, index=False, encoding=\"utf-8\")\n", "\n", " print(\"CSV Saved: {}\".format(file_path))\n", "\n", @@ -735,7 +761,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 145, "metadata": {}, "outputs": [ { @@ -760,7 +786,7 @@ " username=USER_UNAME,\n", " password=USER_PASSWORD,\n", " # max_tweets=10,\n", - " # scrape_username=\"\",\n", + " # scrape_username=\"something\",\n", " # scrape_hashtag=\"something\",\n", " # scrape_query=\"something\",\n", " # scrape_latest=True,\n", @@ -778,7 +804,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 146, "metadata": {}, "outputs": [ { @@ -798,6 +824,7 @@ " # max_tweets=10,\n", " # scrape_username=\"something\",\n", " # scrape_hashtag=\"something\",\n", + " # scrape_hashtag=\"something\",\n", " # scrape_query=\"something\",\n", " # scrape_latest=True,\n", " # scrape_top=False,\n", @@ -814,7 +841,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 147, "metadata": {}, "outputs": [ { @@ -822,7 +849,7 @@ "output_type": "stream", "text": [ "Saving Tweets to CSV...\n", - "CSV Saved: ./tweets/2023-09-20_09-26-30_tweets_1-50.csv\n" + "CSV Saved: ./tweets/2023-09-23_09-54-41_tweets_1-50.csv\n" ] } ], @@ -832,7 +859,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 148, "metadata": {}, "outputs": [], "source": [ diff --git a/scraper/tweet.py b/scraper/tweet.py index d507ee1..6d30b87 100644 --- a/scraper/tweet.py +++ b/scraper/tweet.py @@ -77,13 +77,6 @@ class Tweet: except NoSuchElementException: self.analytics_cnt = "0" - try: - self.profile_img = card.find_element( - "xpath", './/div[@data-testid="Tweet-User-Avatar"]//img' - ).get_attribute("src") - except NoSuchElementException: - self.profile_img = "" - try: self.tags = card.find_elements( "xpath", @@ -94,6 +87,36 @@ class Tweet: except NoSuchElementException: self.tags = [] + try: + self.mentions = card.find_elements( + "xpath", + '(.//div[@data-testid="tweetText"])[1]//a[contains(text(), "@")]', + ) + + self.mentions = [mention.text for mention in self.mentions] + except NoSuchElementException: + self.mentions = [] + + try: + raw_emojis = card.find_elements( + "xpath", + '(.//div[@data-testid="tweetText"])[1]/img[contains(@src, "emoji")]', + ) + + self.emojis = [ + emoji.get_attribute("alt").encode("unicode-escape").decode("ASCII") + for emoji in raw_emojis + ] + except NoSuchElementException: + self.emojis = [] + + try: + self.profile_img = card.find_element( + "xpath", './/div[@data-testid="Tweet-User-Avatar"]//img' + ).get_attribute("src") + except NoSuchElementException: + self.profile_img = "" + self.tweet = ( self.user, self.handle, @@ -105,6 +128,8 @@ class Tweet: self.like_cnt, self.analytics_cnt, self.tags, + self.mentions, + self.emojis, self.profile_img, ) diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index c1fdc97..da14b38 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -88,6 +88,7 @@ class Twitter_Scraper: "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest", } self.router = self.go_to_home + self.scroller = Scroller(self.driver) if scrape_username is not None: self.scraper_details["type"] = "Username" @@ -462,14 +463,16 @@ It may be due to the following: "Likes": [tweet[7] for tweet in self.data], "Analytics": [tweet[8] for tweet in self.data], "Tags": [tweet[9] for tweet in self.data], - "Profile Image": [tweet[10] for tweet in self.data], + "Mentions": [tweet[10] for tweet in self.data], + "Emojis": [tweet[11] for tweet in self.data], + "Profile Image": [tweet[12] for tweet in self.data], } df = pd.DataFrame(data) current_time = now.strftime("%Y-%m-%d_%H-%M-%S") file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv" - df.to_csv(file_path, index=False) + df.to_csv(file_path, index=False, encoding="utf-8") print("CSV Saved: {}".format(file_path))