feat(ipynb): Add tweet link, tweet id, and user id

2023-09-28 11:05:33 +08:00
parent 5c98c3ebc4
commit e3960dedcc
1 changed file with 55 additions and 61 deletions
@@ -17,7 +17,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -57,7 +57,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -97,7 +97,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -141,7 +141,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -283,8 +283,19 @@
    "        except NoSuchElementException:\n",
    "            self.profile_img = \"\"\n",
    "            \n",
    "        try:\n",
    "            self.tweet_link = self.card.find_element(\n",
    "                \"xpath\",\n",
    "                \".//a[contains(@href, '/status/')]\",\n",
    "            ).get_attribute(\"href\")\n",
    "            self.tweet_id = str(self.tweet_link.split(\"/\")[-1])\n",
    "        except NoSuchElementException:\n",
    "            self.tweet_link = \"\"\n",
    "            self.tweet_id = \"\"\n",
    "        \n",
    "        self.following_cnt = \"0\"\n",
    "        self.followers_cnt = \"0\"\n",
    "        self.user_id = None\n",
    "        \n",
    "        if scrape_poster_details:\n",
    "            el_name = card.find_element(\n",
@@ -292,11 +303,12 @@
    "            )\n",
    "            \n",
    "            ext_hover_card = False\n",
    "            ext_user_id = False\n",
    "            ext_following = False\n",
    "            ext_followers = False\n",
    "            hover_attempt = 0\n",
    "            \n",
-    "            while not ext_hover_card or not ext_following or not ext_followers:\n",
+    "            while not ext_hover_card or not ext_user_id or not ext_following or not ext_followers:\n",
    "                try:\n",
    "                    actions.move_to_element(el_name).perform()\n",
    "                    \n",
@@ -307,6 +319,25 @@
    "                    \n",
    "                    ext_hover_card = True\n",
    "                    \n",
    "                    while not ext_user_id:\n",
    "                        try:\n",
    "                            raw_user_id = hover_card.find_element(\n",
    "                                \"xpath\",\n",
    "                                '(.//div[contains(@data-testid, \"-follow\")]) | (.//div[contains(@data-testid, \"-unfollow\")])'\n",
    "                            ).get_attribute(\"data-testid\")\n",
    "                            \n",
    "                            if raw_user_id == \"\":\n",
    "                                self.user_id = None\n",
    "                            else:\n",
    "                                self.user_id = str(raw_user_id.split(\"-\")[0])\n",
    "                            \n",
    "                            ext_user_id = True\n",
    "                        except NoSuchElementException:\n",
    "                            continue\n",
    "                        except StaleElementReferenceException:\n",
    "                            self.error = True\n",
    "                            return\n",
    "                    \n",
    "                    while not ext_following:\n",
    "                        try:\n",
    "                            self.following_cnt = hover_card.find_element(\n",
@@ -368,6 +399,9 @@
    "            self.mentions,\n",
    "            self.emojis,\n",
    "            self.profile_img,\n",
    "            self.tweet_link,\n",
    "            self.tweet_id,\n",
    "            self.user_id,\n",
    "            self.following_cnt,\n",
    "            self.followers_cnt,\n",
    "        )\n",
@@ -387,7 +421,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -870,16 +904,20 @@
    "            \"Mentions\": [tweet[10] for tweet in self.data],\n",
    "            \"Emojis\": [tweet[11] for tweet in self.data],\n",
    "            \"Profile Image\": [tweet[12] for tweet in self.data],\n",
    "            \"Tweet Link\": [tweet[13] for tweet in self.data],\n",
    "            \"Tweet ID\": [f'tweet_id:{tweet[14]}' for tweet in self.data],\n",
    "        }\n",
    "\n",
    "        if self.scraper_details[\"poster_details\"]:\n",
-    "            data[\"Following\"] = [tweet[13] for tweet in self.data]\n",
+    "            data[\"Tweeter ID\"] = [f'user_id:{tweet[15]}' for tweet in self.data]\n",
-    "            data[\"Followers\"] = [tweet[14] for tweet in self.data]\n",
+    "            data[\"Following\"] = [tweet[16] for tweet in self.data]\n",
    "            data[\"Followers\"] = [tweet[17] for tweet in self.data]\n",
    "\n",
    "        df = pd.DataFrame(data)\n",
    "\n",
    "        current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
    "        file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n",
    "        pd.set_option(\"display.max_colwidth\", None)\n",
    "        df.to_csv(file_path, index=False, encoding=\"utf-8\")\n",
    "\n",
    "        print(\"CSV Saved: {}\".format(file_path))\n",
@@ -900,20 +938,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initializing Twitter Scraper...\n",
      "Setup WebDriver...\n",
      "Initializing ChromeDriver...\n",
      "WebDriver Setup Complete\n"
     ]
    }
   ],
   "source": [
    "USER_UNAME = os.environ['TWITTER_USERNAME']\n",
    "USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n",
@@ -933,21 +960,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Logging in to Twitter...\n",
      "\n",
      "Login Successful\n",
      "\n"
     ]
    }
   ],
   "source": [
    "scraper.login()"
   ]
@@ -962,21 +977,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Scraping Tweets from Home...\n",
      "Progress: [[========================================]] 100.00% 50 of 50\n",
      "Scraping Complete\n",
      "Tweets: 50 out of 50\n",
      "\n"
     ]
    }
   ],
   "source": [
    "scraper.scrape_tweets(\n",
    "    # max_tweets=100,\n",
@@ -999,25 +1002,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving Tweets to CSV...\n",
      "CSV Saved: ./tweets/2023-09-25_08-20-51_tweets_1-50.csv\n"
     ]
    }
   ],
   "source": [
    "scraper.save_to_csv()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [