feat(ipynb): Add tweet link, tweet id, and user id

This commit is contained in:
Jarrian
2023-09-28 11:05:33 +08:00
parent 5c98c3ebc4
commit e3960dedcc

View File

@@ -17,7 +17,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -57,7 +57,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -97,7 +97,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -141,7 +141,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -283,8 +283,19 @@
" except NoSuchElementException:\n", " except NoSuchElementException:\n",
" self.profile_img = \"\"\n", " self.profile_img = \"\"\n",
" \n", " \n",
" try:\n",
" self.tweet_link = self.card.find_element(\n",
" \"xpath\",\n",
" \".//a[contains(@href, '/status/')]\",\n",
" ).get_attribute(\"href\")\n",
" self.tweet_id = str(self.tweet_link.split(\"/\")[-1])\n",
" except NoSuchElementException:\n",
" self.tweet_link = \"\"\n",
" self.tweet_id = \"\"\n",
" \n",
" self.following_cnt = \"0\"\n", " self.following_cnt = \"0\"\n",
" self.followers_cnt = \"0\"\n", " self.followers_cnt = \"0\"\n",
" self.user_id = None\n",
" \n", " \n",
" if scrape_poster_details:\n", " if scrape_poster_details:\n",
" el_name = card.find_element(\n", " el_name = card.find_element(\n",
@@ -292,11 +303,12 @@
" )\n", " )\n",
" \n", " \n",
" ext_hover_card = False\n", " ext_hover_card = False\n",
" ext_user_id = False\n",
" ext_following = False\n", " ext_following = False\n",
" ext_followers = False\n", " ext_followers = False\n",
" hover_attempt = 0\n", " hover_attempt = 0\n",
" \n", " \n",
" while not ext_hover_card or not ext_following or not ext_followers:\n", " while not ext_hover_card or not ext_user_id or not ext_following or not ext_followers:\n",
" try:\n", " try:\n",
" actions.move_to_element(el_name).perform()\n", " actions.move_to_element(el_name).perform()\n",
" \n", " \n",
@@ -307,6 +319,25 @@
" \n", " \n",
" ext_hover_card = True\n", " ext_hover_card = True\n",
" \n", " \n",
" while not ext_user_id:\n",
" try:\n",
" raw_user_id = hover_card.find_element(\n",
" \"xpath\",\n",
" '(.//div[contains(@data-testid, \"-follow\")]) | (.//div[contains(@data-testid, \"-unfollow\")])'\n",
" ).get_attribute(\"data-testid\")\n",
" \n",
" if raw_user_id == \"\":\n",
" self.user_id = None\n",
" else:\n",
" self.user_id = str(raw_user_id.split(\"-\")[0])\n",
" \n",
" ext_user_id = True\n",
" except NoSuchElementException:\n",
" continue\n",
" except StaleElementReferenceException:\n",
" self.error = True\n",
" return\n",
" \n",
" while not ext_following:\n", " while not ext_following:\n",
" try:\n", " try:\n",
" self.following_cnt = hover_card.find_element(\n", " self.following_cnt = hover_card.find_element(\n",
@@ -368,6 +399,9 @@
" self.mentions,\n", " self.mentions,\n",
" self.emojis,\n", " self.emojis,\n",
" self.profile_img,\n", " self.profile_img,\n",
" self.tweet_link,\n",
" self.tweet_id,\n",
" self.user_id,\n",
" self.following_cnt,\n", " self.following_cnt,\n",
" self.followers_cnt,\n", " self.followers_cnt,\n",
" )\n", " )\n",
@@ -387,7 +421,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -870,16 +904,20 @@
" \"Mentions\": [tweet[10] for tweet in self.data],\n", " \"Mentions\": [tweet[10] for tweet in self.data],\n",
" \"Emojis\": [tweet[11] for tweet in self.data],\n", " \"Emojis\": [tweet[11] for tweet in self.data],\n",
" \"Profile Image\": [tweet[12] for tweet in self.data],\n", " \"Profile Image\": [tweet[12] for tweet in self.data],\n",
" \"Tweet Link\": [tweet[13] for tweet in self.data],\n",
" \"Tweet ID\": [f'tweet_id:{tweet[14]}' for tweet in self.data],\n",
" }\n", " }\n",
"\n", "\n",
" if self.scraper_details[\"poster_details\"]:\n", " if self.scraper_details[\"poster_details\"]:\n",
" data[\"Following\"] = [tweet[13] for tweet in self.data]\n", " data[\"Tweeter ID\"] = [f'user_id:{tweet[15]}' for tweet in self.data]\n",
" data[\"Followers\"] = [tweet[14] for tweet in self.data]\n", " data[\"Following\"] = [tweet[16] for tweet in self.data]\n",
" data[\"Followers\"] = [tweet[17] for tweet in self.data]\n",
"\n", "\n",
" df = pd.DataFrame(data)\n", " df = pd.DataFrame(data)\n",
"\n", "\n",
" current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n", " current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
" file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n", " file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n",
" pd.set_option(\"display.max_colwidth\", None)\n",
" df.to_csv(file_path, index=False, encoding=\"utf-8\")\n", " df.to_csv(file_path, index=False, encoding=\"utf-8\")\n",
"\n", "\n",
" print(\"CSV Saved: {}\".format(file_path))\n", " print(\"CSV Saved: {}\".format(file_path))\n",
@@ -900,20 +938,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initializing Twitter Scraper...\n",
"Setup WebDriver...\n",
"Initializing ChromeDriver...\n",
"WebDriver Setup Complete\n"
]
}
],
"source": [ "source": [
"USER_UNAME = os.environ['TWITTER_USERNAME']\n", "USER_UNAME = os.environ['TWITTER_USERNAME']\n",
"USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n", "USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n",
@@ -933,21 +960,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Logging in to Twitter...\n",
"\n",
"Login Successful\n",
"\n"
]
}
],
"source": [ "source": [
"scraper.login()" "scraper.login()"
] ]
@@ -962,21 +977,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Scraping Tweets from Home...\n",
"Progress: [[========================================]] 100.00% 50 of 50\n",
"Scraping Complete\n",
"Tweets: 50 out of 50\n",
"\n"
]
}
],
"source": [ "source": [
"scraper.scrape_tweets(\n", "scraper.scrape_tweets(\n",
" # max_tweets=100,\n", " # max_tweets=100,\n",
@@ -999,25 +1002,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saving Tweets to CSV...\n",
"CSV Saved: ./tweets/2023-09-25_08-20-51_tweets_1-50.csv\n"
]
}
],
"source": [ "source": [
"scraper.save_to_csv()" "scraper.save_to_csv()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [