diff --git a/main.ipynb b/main.ipynb index 4895850..dc4233d 100644 --- a/main.ipynb +++ b/main.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -97,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -141,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -282,9 +282,20 @@ " ).get_attribute(\"src\")\n", " except NoSuchElementException:\n", " self.profile_img = \"\"\n", + " \n", + " try:\n", + " self.tweet_link = self.card.find_element(\n", + " \"xpath\",\n", + " \".//a[contains(@href, '/status/')]\",\n", + " ).get_attribute(\"href\")\n", + " self.tweet_id = str(self.tweet_link.split(\"/\")[-1])\n", + " except NoSuchElementException:\n", + " self.tweet_link = \"\"\n", + " self.tweet_id = \"\"\n", " \n", " self.following_cnt = \"0\"\n", " self.followers_cnt = \"0\"\n", + " self.user_id = None\n", " \n", " if scrape_poster_details:\n", " el_name = card.find_element(\n", @@ -292,11 +303,12 @@ " )\n", " \n", " ext_hover_card = False\n", + " ext_user_id = False\n", " ext_following = False\n", " ext_followers = False\n", " hover_attempt = 0\n", " \n", - " while not ext_hover_card or not ext_following or not ext_followers:\n", + " while not ext_hover_card or not ext_user_id or not ext_following or not ext_followers:\n", " try:\n", " actions.move_to_element(el_name).perform()\n", " \n", @@ -307,6 +319,25 @@ " \n", " ext_hover_card = True\n", " \n", + " while not ext_user_id:\n", + " try:\n", + " raw_user_id = hover_card.find_element(\n", + " \"xpath\",\n", + " '(.//div[contains(@data-testid, \"-follow\")]) | (.//div[contains(@data-testid, \"-unfollow\")])'\n", + " ).get_attribute(\"data-testid\")\n", + " \n", + " if raw_user_id == \"\":\n", + " self.user_id = None\n", + " else:\n", + " self.user_id = str(raw_user_id.split(\"-\")[0])\n", + " \n", + " ext_user_id = True\n", + " except NoSuchElementException:\n", + " continue\n", + " except StaleElementReferenceException:\n", + " self.error = True\n", + " return\n", + " \n", " while not ext_following:\n", " try:\n", " self.following_cnt = hover_card.find_element(\n", @@ -368,6 +399,9 @@ " self.mentions,\n", " self.emojis,\n", " self.profile_img,\n", + " self.tweet_link,\n", + " self.tweet_id,\n", + " self.user_id,\n", " self.following_cnt,\n", " self.followers_cnt,\n", " )\n", @@ -387,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -870,16 +904,20 @@ " \"Mentions\": [tweet[10] for tweet in self.data],\n", " \"Emojis\": [tweet[11] for tweet in self.data],\n", " \"Profile Image\": [tweet[12] for tweet in self.data],\n", + " \"Tweet Link\": [tweet[13] for tweet in self.data],\n", + " \"Tweet ID\": [f'tweet_id:{tweet[14]}' for tweet in self.data],\n", " }\n", "\n", " if self.scraper_details[\"poster_details\"]:\n", - " data[\"Following\"] = [tweet[13] for tweet in self.data]\n", - " data[\"Followers\"] = [tweet[14] for tweet in self.data]\n", + " data[\"Tweeter ID\"] = [f'user_id:{tweet[15]}' for tweet in self.data]\n", + " data[\"Following\"] = [tweet[16] for tweet in self.data]\n", + " data[\"Followers\"] = [tweet[17] for tweet in self.data]\n", "\n", " df = pd.DataFrame(data)\n", "\n", " current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n", " file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n", + " pd.set_option(\"display.max_colwidth\", None)\n", " df.to_csv(file_path, index=False, encoding=\"utf-8\")\n", "\n", " print(\"CSV Saved: {}\".format(file_path))\n", @@ -900,20 +938,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Initializing Twitter Scraper...\n", - "Setup WebDriver...\n", - "Initializing ChromeDriver...\n", - "WebDriver Setup Complete\n" - ] - } - ], + "outputs": [], "source": [ "USER_UNAME = os.environ['TWITTER_USERNAME']\n", "USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n", @@ -933,21 +960,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Logging in to Twitter...\n", - "\n", - "Login Successful\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "scraper.login()" ] @@ -962,21 +977,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Scraping Tweets from Home...\n", - "Progress: [[========================================]] 100.00% 50 of 50\n", - "Scraping Complete\n", - "Tweets: 50 out of 50\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "scraper.scrape_tweets(\n", " # max_tweets=100,\n", @@ -999,25 +1002,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving Tweets to CSV...\n", - "CSV Saved: ./tweets/2023-09-25_08-20-51_tweets_1-50.csv\n" - ] - } - ], + "outputs": [], "source": [ "scraper.save_to_csv()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [