feat(ipynb): Add tweet link, tweet id, and user id
This commit is contained in:
116
main.ipynb
116
main.ipynb
@@ -17,7 +17,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -57,7 +57,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -97,7 +97,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -141,7 +141,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -282,9 +282,20 @@
|
||||
" ).get_attribute(\"src\")\n",
|
||||
" except NoSuchElementException:\n",
|
||||
" self.profile_img = \"\"\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" self.tweet_link = self.card.find_element(\n",
|
||||
" \"xpath\",\n",
|
||||
" \".//a[contains(@href, '/status/')]\",\n",
|
||||
" ).get_attribute(\"href\")\n",
|
||||
" self.tweet_id = str(self.tweet_link.split(\"/\")[-1])\n",
|
||||
" except NoSuchElementException:\n",
|
||||
" self.tweet_link = \"\"\n",
|
||||
" self.tweet_id = \"\"\n",
|
||||
" \n",
|
||||
" self.following_cnt = \"0\"\n",
|
||||
" self.followers_cnt = \"0\"\n",
|
||||
" self.user_id = None\n",
|
||||
" \n",
|
||||
" if scrape_poster_details:\n",
|
||||
" el_name = card.find_element(\n",
|
||||
@@ -292,11 +303,12 @@
|
||||
" )\n",
|
||||
" \n",
|
||||
" ext_hover_card = False\n",
|
||||
" ext_user_id = False\n",
|
||||
" ext_following = False\n",
|
||||
" ext_followers = False\n",
|
||||
" hover_attempt = 0\n",
|
||||
" \n",
|
||||
" while not ext_hover_card or not ext_following or not ext_followers:\n",
|
||||
" while not ext_hover_card or not ext_user_id or not ext_following or not ext_followers:\n",
|
||||
" try:\n",
|
||||
" actions.move_to_element(el_name).perform()\n",
|
||||
" \n",
|
||||
@@ -307,6 +319,25 @@
|
||||
" \n",
|
||||
" ext_hover_card = True\n",
|
||||
" \n",
|
||||
" while not ext_user_id:\n",
|
||||
" try:\n",
|
||||
" raw_user_id = hover_card.find_element(\n",
|
||||
" \"xpath\",\n",
|
||||
" '(.//div[contains(@data-testid, \"-follow\")]) | (.//div[contains(@data-testid, \"-unfollow\")])'\n",
|
||||
" ).get_attribute(\"data-testid\")\n",
|
||||
" \n",
|
||||
" if raw_user_id == \"\":\n",
|
||||
" self.user_id = None\n",
|
||||
" else:\n",
|
||||
" self.user_id = str(raw_user_id.split(\"-\")[0])\n",
|
||||
" \n",
|
||||
" ext_user_id = True\n",
|
||||
" except NoSuchElementException:\n",
|
||||
" continue\n",
|
||||
" except StaleElementReferenceException:\n",
|
||||
" self.error = True\n",
|
||||
" return\n",
|
||||
" \n",
|
||||
" while not ext_following:\n",
|
||||
" try:\n",
|
||||
" self.following_cnt = hover_card.find_element(\n",
|
||||
@@ -368,6 +399,9 @@
|
||||
" self.mentions,\n",
|
||||
" self.emojis,\n",
|
||||
" self.profile_img,\n",
|
||||
" self.tweet_link,\n",
|
||||
" self.tweet_id,\n",
|
||||
" self.user_id,\n",
|
||||
" self.following_cnt,\n",
|
||||
" self.followers_cnt,\n",
|
||||
" )\n",
|
||||
@@ -387,7 +421,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -870,16 +904,20 @@
|
||||
" \"Mentions\": [tweet[10] for tweet in self.data],\n",
|
||||
" \"Emojis\": [tweet[11] for tweet in self.data],\n",
|
||||
" \"Profile Image\": [tweet[12] for tweet in self.data],\n",
|
||||
" \"Tweet Link\": [tweet[13] for tweet in self.data],\n",
|
||||
" \"Tweet ID\": [f'tweet_id:{tweet[14]}' for tweet in self.data],\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" if self.scraper_details[\"poster_details\"]:\n",
|
||||
" data[\"Following\"] = [tweet[13] for tweet in self.data]\n",
|
||||
" data[\"Followers\"] = [tweet[14] for tweet in self.data]\n",
|
||||
" data[\"Tweeter ID\"] = [f'user_id:{tweet[15]}' for tweet in self.data]\n",
|
||||
" data[\"Following\"] = [tweet[16] for tweet in self.data]\n",
|
||||
" data[\"Followers\"] = [tweet[17] for tweet in self.data]\n",
|
||||
"\n",
|
||||
" df = pd.DataFrame(data)\n",
|
||||
"\n",
|
||||
" current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
|
||||
" file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n",
|
||||
" pd.set_option(\"display.max_colwidth\", None)\n",
|
||||
" df.to_csv(file_path, index=False, encoding=\"utf-8\")\n",
|
||||
"\n",
|
||||
" print(\"CSV Saved: {}\".format(file_path))\n",
|
||||
@@ -900,20 +938,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Initializing Twitter Scraper...\n",
|
||||
"Setup WebDriver...\n",
|
||||
"Initializing ChromeDriver...\n",
|
||||
"WebDriver Setup Complete\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"USER_UNAME = os.environ['TWITTER_USERNAME']\n",
|
||||
"USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n",
|
||||
@@ -933,21 +960,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Logging in to Twitter...\n",
|
||||
"\n",
|
||||
"Login Successful\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scraper.login()"
|
||||
]
|
||||
@@ -962,21 +977,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Scraping Tweets from Home...\n",
|
||||
"Progress: [[========================================]] 100.00% 50 of 50\n",
|
||||
"Scraping Complete\n",
|
||||
"Tweets: 50 out of 50\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scraper.scrape_tweets(\n",
|
||||
" # max_tweets=100,\n",
|
||||
@@ -999,25 +1002,16 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Saving Tweets to CSV...\n",
|
||||
"CSV Saved: ./tweets/2023-09-25_08-20-51_tweets_1-50.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scraper.save_to_csv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
||||
Reference in New Issue
Block a user