feat: scrape followers and following counts in Jupyter notebook

commit da77993c12
parent 6f36bc33ca
Author: Jarrian
Date: 2023-09-24 23:54:12 +08:00

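The core of this change is reading a poster's following/followers counts from the profile hover card that Twitter/X renders when the cursor rests on a display name. A minimal sketch of the technique, distilled from the diff below — the helper name and retry bound are illustrative; the XPaths and data-testid values are the ones the commit itself uses:

# Hedged sketch: hover the poster's display name with ActionChains, wait for the
# profile hover card to render, then read its Following/Followers anchors.
from time import sleep

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.remote.webelement import WebElement


def scrape_poster_counts(driver: WebDriver, card: WebElement, max_attempts: int = 3):
    """Return (following, followers) for the tweet card's poster, else ("0", "0")."""
    actions = ActionChains(driver)
    name_el = card.find_element("xpath", './/div[@data-testid="User-Name"]//span')

    for _ in range(max_attempts):
        try:
            # Hovering the name makes Twitter render the hover card in the DOM.
            actions.move_to_element(name_el).perform()
            hover_card = driver.find_element(
                "xpath", '//div[@data-testid="hoverCardParent"]'
            )
            following = hover_card.find_element(
                "xpath", './/a[contains(@href, "/following")]//span'
            ).text
            followers = hover_card.find_element(
                "xpath", './/a[contains(@href, "/verified_followers")]//span'
            ).text
            actions.reset_actions()  # release the hover so the card closes
            return following or "0", followers or "0"
        except NoSuchElementException:
            sleep(0.5)  # hover card not rendered yet; retry
    return "0", "0"

The diff hardens the same idea with retry flags and StaleElementReferenceException handling, since the hover card is rebuilt on every hover.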

@@ -17,7 +17,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 140,
+"execution_count": 94,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -36,7 +36,9 @@
 " StaleElementReferenceException,\n",
 " WebDriverException,\n",
 ")\n",
+"from selenium.webdriver.common.action_chains import ActionChains\n",
 "\n",
+"from selenium.webdriver.chrome.webdriver import WebDriver\n",
 "from selenium.webdriver.chrome.options import Options as ChromeOptions\n",
 "from selenium.webdriver.chrome.service import Service as ChromeService\n",
 "\n",
@@ -55,7 +57,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 141,
+"execution_count": 95,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -95,7 +97,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 142,
+"execution_count": 96,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -139,27 +141,37 @@
 },
 {
 "cell_type": "code",
-"execution_count": 143,
+"execution_count": 97,
 "metadata": {},
 "outputs": [],
 "source": [
 "class Tweet:\n",
-" def __init__(self, card: Chrome) -> None:\n",
+" def __init__(\n",
+" self,\n",
+" card: WebDriver,\n",
+" driver: WebDriver,\n",
+" actions: ActionChains,\n",
+" scrape_poster_details=False\n",
+" ) -> None:\n",
 " self.card = card\n",
+" self.error = False\n",
+" self.tweet = None\n",
 "\n",
 " try:\n",
 " self.user = card.find_element(\n",
 " \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n",
 " ).text\n",
 " except NoSuchElementException:\n",
-" return\n",
+" self.error = True\n",
+" self.user = \"skip\"\n",
 "\n",
 " try:\n",
 " self.handle = card.find_element(\n",
 " \"xpath\", './/span[contains(text(), \"@\")]'\n",
 " ).text\n",
 " except NoSuchElementException:\n",
-" return\n",
+" self.error = True\n",
+" self.handle = \"skip\"\n",
 "\n",
 " try:\n",
 " self.date_time = card.find_element(\"xpath\", \".//time\").get_attribute(\n",
@@ -170,6 +182,10 @@
 " self.is_ad = False\n",
 " except NoSuchElementException:\n",
 " self.is_ad = True\n",
+" self.error = True\n",
+" self.date_time = \"skip\"\n",
+" \n",
+" if self.error:\n",
 " return\n",
 "\n",
 " try:\n",
@@ -267,7 +283,78 @@
 " ).get_attribute(\"src\")\n",
 " except NoSuchElementException:\n",
 " self.profile_img = \"\"\n",
-"\n",
+" \n",
+" self.following_cnt = \"0\"\n",
+" self.followers_cnt = \"0\"\n",
+" \n",
+" if scrape_poster_details:\n",
+" el_name = card.find_element(\n",
+" \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n",
+" )\n",
+" \n",
+" ext_hover_card = False\n",
+" ext_following = False\n",
+" ext_followers = False\n",
+" hover_attempt = 0\n",
+" \n",
+" while not ext_hover_card or not ext_following or not ext_followers:\n",
+" try:\n",
+" actions.move_to_element(el_name).perform()\n",
+" \n",
+" hover_card = driver.find_element(\n",
+" \"xpath\",\n",
+" '//div[@data-testid=\"hoverCardParent\"]'\n",
+" )\n",
+" \n",
+" ext_hover_card = True\n",
+" \n",
+" while not ext_following:\n",
+" try:\n",
+" self.following_cnt = hover_card.find_element(\n",
+" \"xpath\",\n",
+" './/a[contains(@href, \"/following\")]//span'\n",
+" ).text\n",
+" \n",
+" if self.following_cnt == \"\":\n",
+" self.following_cnt = \"0\"\n",
+" \n",
+" ext_following = True\n",
+" except NoSuchElementException:\n",
+" continue\n",
+" except StaleElementReferenceException:\n",
+" self.error = True\n",
+" return\n",
+" \n",
+" while not ext_followers:\n",
+" try:\n",
+" self.followers_cnt = hover_card.find_element(\n",
+" \"xpath\",\n",
+" './/a[contains(@href, \"/verified_followers\")]//span'\n",
+" ).text\n",
+" \n",
+" if self.followers_cnt == \"\":\n",
+" self.followers_cnt = \"0\"\n",
+" \n",
+" ext_followers = True\n",
+" except NoSuchElementException:\n",
+" continue\n",
+" except StaleElementReferenceException:\n",
+" self.error = True\n",
+" return\n",
" except NoSuchElementException:\n",
" if hover_attempt==3:\n",
" self.error\n",
" return\n",
" hover_attempt+=1\n",
" sleep(0.5)\n",
" continue\n",
" except StaleElementReferenceException:\n",
" self.error = True\n",
" return\n",
" \n",
" if ext_hover_card and ext_following and ext_followers:\n",
" actions.reset_actions()\n",
" \n",
" self.tweet = (\n", " self.tweet = (\n",
" self.user,\n", " self.user,\n",
" self.handle,\n", " self.handle,\n",
@@ -282,6 +369,8 @@
 " self.mentions,\n",
 " self.emojis,\n",
 " self.profile_img,\n",
+" self.following_cnt,\n",
+" self.followers_cnt,\n",
 " )\n",
 "\n",
 " pass\n"
@@ -299,12 +388,13 @@
 },
 {
 "cell_type": "code",
-"execution_count": 144,
+"execution_count": 98,
 "metadata": {},
 "outputs": [],
 "source": [
 "TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n",
 "\n",
+"\n",
 "class Twitter_Scraper:\n",
 " def __init__(\n",
 " self,\n",
@@ -314,6 +404,7 @@
 " scrape_username=None,\n",
 " scrape_hashtag=None,\n",
 " scrape_query=None,\n",
+" scrape_poster_details=False,\n",
 " scrape_latest=True,\n",
 " scrape_top=False,\n",
 " ):\n",
@@ -329,13 +420,14 @@
 " \"hashtag\": None,\n",
 " \"query\": None,\n",
 " \"tab\": None,\n",
+" \"poster_details\": False,\n",
 " }\n",
 " self.max_tweets = max_tweets\n",
 " self.progress = Progress(0, max_tweets)\n",
 " self.router = self.go_to_home\n",
 " self.driver = self._get_driver()\n",
+" self.actions = ActionChains(self.driver)\n",
 " self.scroller = Scroller(self.driver)\n",
-" self._login()\n",
 " self._config_scraper(\n",
 " max_tweets,\n",
 " scrape_username,\n",
@@ -343,6 +435,7 @@
 " scrape_query,\n",
 " scrape_latest,\n",
 " scrape_top,\n",
+" scrape_poster_details,\n",
 " )\n",
 "\n",
 " def _config_scraper(\n",
@@ -353,6 +446,7 @@
 " scrape_query=None,\n",
 " scrape_latest=True,\n",
 " scrape_top=False,\n",
+" scrape_poster_details=False,\n",
 " ):\n",
 " self.tweet_ids = set()\n",
 " self.data = []\n",
@@ -367,6 +461,7 @@
 " else None,\n",
 " \"query\": scrape_query,\n",
 " \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n",
+" \"poster_details\": scrape_poster_details,\n",
 " }\n",
 " self.router = self.go_to_home\n",
 " self.scroller = Scroller(self.driver)\n",
@@ -408,6 +503,7 @@
 " options=browser_option,\n",
 " )\n",
 "\n",
+" print(\"WebDriver Setup Complete\")\n",
 " return driver\n",
 " except WebDriverException:\n",
 " try:\n",
@@ -421,17 +517,20 @@
 " options=browser_option,\n",
 " )\n",
 "\n",
+" print(\"WebDriver Setup Complete\")\n",
 " return driver\n",
 " except Exception as e:\n",
 " print(f\"Error setting up WebDriver: {e}\")\n",
 " sys.exit(1)\n",
+" pass\n",
 "\n",
-" def _login(self):\n",
+" def login(self):\n",
+" print()\n",
 " print(\"Logging in to Twitter...\")\n",
 "\n",
 " try:\n",
-" self.driver.get(TWITTER_LOGIN_URL)\n",
 " self.driver.maximize_window()\n",
+" self.driver.get(TWITTER_LOGIN_URL)\n",
 " sleep(3)\n",
 "\n",
 " self._input_username()\n",
@@ -594,10 +693,24 @@
 "\n",
 " def get_tweet_cards(self):\n",
 " self.tweet_cards = self.driver.find_elements(\n",
-" \"xpath\", '//article[@data-testid=\"tweet\"]'\n",
+" \"xpath\", '//article[@data-testid=\"tweet\" and not(@disabled)]'\n",
 " )\n",
 " pass\n",
 "\n",
+" def remove_hidden_cards(self):\n",
+" try:\n",
+" hidden_cards = self.driver.find_elements(\n",
+" \"xpath\", '//article[@data-testid=\"tweet\" and @disabled]'\n",
+" )\n",
+"\n",
+" for card in hidden_cards[1:-2]:\n",
+" self.driver.execute_script(\n",
+" \"arguments[0].parentNode.parentNode.parentNode.remove();\", card\n",
+" )\n",
+" except Exception as e:\n",
+" return\n",
+" pass\n",
+"\n",
 " def scrape_tweets(\n",
 " self,\n",
 " max_tweets=50,\n",
@@ -606,6 +719,7 @@
 " scrape_query=None,\n",
 " scrape_latest=True,\n",
 " scrape_top=False,\n",
+" scrape_poster_details=False,\n",
 " router=None,\n",
 " ):\n",
 " self._config_scraper(\n",
@@ -615,6 +729,7 @@
 " scrape_query,\n",
 " scrape_latest,\n",
 " scrape_top,\n",
+" scrape_poster_details,\n",
 " )\n",
 "\n",
 " if router is None:\n",
@@ -645,6 +760,8 @@
 "\n",
 " refresh_count = 0\n",
 " added_tweets = 0\n",
+" empty_count = 0\n",
+" # stale_count = 0\n",
 "\n",
 " while self.scroller.scrolling:\n",
 " try:\n",
@@ -652,16 +769,45 @@
 " added_tweets = 0\n",
 "\n",
 " for card in self.tweet_cards[-15:]:\n",
-" tweet = Tweet(card)\n",
-"\n",
 " try:\n",
-" tweet_id = f\"{tweet.user}{tweet.handle}{tweet.date_time}\"\n",
-" except Exception as e:\n",
-" continue\n",
+" tweet_id = str(card)\n",
+"\n",
+" # def hide_card(el):\n",
+" # self.driver.execute_script(\n",
+" # \"arguments[0].setAttribute('disabled', true);\", el\n",
+" # )\n",
+"\n",
+" # self.driver.execute_script(\n",
+" # \"arguments[0].parentElement.parentElement.parentElement.style.display='none';\", el\n",
+" # )\n",
 "\n",
 " if tweet_id not in self.tweet_ids:\n",
 " self.tweet_ids.add(tweet_id)\n",
+"\n",
+" if not self.scraper_details[\"poster_details\"]:\n",
+" self.driver.execute_script(\n",
+" \"arguments[0].scrollIntoView();\", card\n",
+" )\n",
+"\n",
+" tweet = Tweet(\n",
+" card=card,\n",
+" driver=self.driver,\n",
+" actions=self.actions,\n",
+" scrape_poster_details=self.scraper_details[\n",
+" \"poster_details\"\n",
+" ],\n",
+" )\n",
+"\n",
 " if tweet:\n",
+" # try:\n",
+" # tweet_sig = f\"{tweet.user}|{tweet.handle}|{tweet.date_time}|{tweet.is_ad}\"\n",
+" # except Exception as e:\n",
+" # continue\n",
+"\n",
+" # if tweet_sig not in self.tweet_ids:\n",
+" # self.tweet_ids.add(tweet_sig)\n",
+"\n",
+" if not tweet.error and tweet.tweet is not None:\n",
 " if not tweet.is_ad:\n",
 " self.data.append(tweet.tweet)\n",
 " added_tweets += 1\n",
@@ -670,44 +816,112 @@
 " if len(self.data) >= self.max_tweets:\n",
 " self.scroller.scrolling = False\n",
 " break\n",
-"\n",
-" if len(self.data) % 50 == 0:\n",
-" sleep(2)\n",
+" else:\n",
+" continue\n",
+" else:\n",
+" continue\n",
+" # else:\n",
+" # continue\n",
+" else:\n",
+" continue\n",
+" else:\n",
+" continue\n",
+" # hide_card(card)\n",
+" except NoSuchElementException:\n",
+" continue\n",
 "\n",
 " if len(self.data) >= self.max_tweets:\n",
 " break\n",
 "\n",
+" # self.remove_hidden_cards()\n",
+"\n",
+" # if added_tweets == 0:\n",
+" # refresh_count += 1\n",
+"\n",
+" # if len(self.tweet_cards) > 0:\n",
+" # self.driver.execute_script(\n",
+" # \"arguments[0].scrollIntoView();\", self.tweet_cards[-1]\n",
+" # )\n",
+" # sleep(2)\n",
+"\n",
+" # sleep(1)\n",
+"\n",
+" # if refresh_count >= 10:\n",
+" # print()\n",
+" # print(\"No more tweets to scrape\")\n",
+" # break\n",
+" # else:\n",
+" # refresh_count = 0\n",
+"\n",
+" # if len(self.tweet_cards) == 0:\n",
+" # empty_count += 1\n",
+" # sleep(1)\n",
+"\n",
+" # if empty_count >= 3:\n",
+" # router()\n",
+" # sleep(2)\n",
+" # break\n",
+" # else:\n",
+" # empty_count = 0\n",
+"\n",
+" # if added_tweets == 0:\n",
+" # refresh_count += 1\n",
+" # sleep(1)\n",
+" # if refresh_count >= 10:\n",
+" # print()\n",
+" # print(\"No more tweets to scrape\")\n",
+" # break\n",
+" # else:\n",
+" # refresh_count = 0\n",
+"\n",
 " if added_tweets == 0:\n",
-" refresh_count += 1\n",
-" if refresh_count >= 10:\n",
+" if empty_count >= 5:\n",
+" if refresh_count >= 3:\n",
 " print()\n",
 " print(\"No more tweets to scrape\")\n",
 " break\n",
-" else:\n",
-" refresh_count = 0\n",
-"\n",
-" self.scroller.scroll_count = 0\n",
-"\n",
-" while True:\n",
-" self.scroller.scroll_to_bottom()\n",
-" sleep(2)\n",
-" self.scroller.update_scroll_position()\n",
-"\n",
-" if self.scroller.last_position == self.scroller.current_position:\n",
-" self.scroller.scroll_count += 1\n",
-"\n",
-" if self.scroller.scroll_count >= 3:\n",
-" router()\n",
-" sleep(2)\n",
-" break\n",
-" else:\n",
+" # router()\n",
+" # sleep(2)\n",
+" refresh_count += 1\n",
+" empty_count += 1\n",
 " sleep(1)\n",
 " else:\n",
-" self.scroller.last_position = self.scroller.current_position\n",
-" break\n",
+" empty_count = 0\n",
+" refresh_count = 0\n",
+"\n",
+" # self.scroller.scroll_count = 0\n",
+"\n",
+" # while True:\n",
+" # self.scroller.scroll_to_bottom()\n",
+" # sleep(2)\n",
+" # self.scroller.update_scroll_position()\n",
+"\n",
+" # if self.scroller.last_position == self.scroller.current_position:\n",
+" # # self.scroller.scroll_count += 1\n",
+"\n",
+" # # if self.scroller.scroll_count >= 3:\n",
+" # # router()\n",
+" # # sleep(2)\n",
+" # # break\n",
+" # # else:\n",
+" # # sleep(1)\n",
+" # sleep(2)\n",
+" # else:\n",
+" # self.scroller.last_position = self.scroller.current_position\n",
+" # break\n",
 " except StaleElementReferenceException:\n",
-" router()\n",
+" # stale_count += 1\n",
+"\n",
+" # if stale_count >= 3:\n",
+" # router()\n",
+" # stale_count = 0\n",
+"\n",
 " sleep(2)\n",
+" continue\n",
+" except KeyboardInterrupt:\n",
+" print(\"\\n\")\n",
+" print(\"Keyboard Interrupt\")\n",
+" break\n",
 " except Exception as e:\n",
 " print(\"\\n\")\n",
 " print(f\"Error scraping tweets: {e}\")\n",
@@ -749,6 +963,10 @@
 " \"Profile Image\": [tweet[12] for tweet in self.data],\n",
 " }\n",
 "\n",
+" if self.scraper_details[\"poster_details\"]:\n",
+" data[\"Following\"] = [tweet[13] for tweet in self.data]\n",
+" data[\"Followers\"] = [tweet[14] for tweet in self.data]\n",
+"\n",
 " df = pd.DataFrame(data)\n",
 "\n",
 " current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
@@ -773,7 +991,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 145,
+"execution_count": 101,
 "metadata": {},
 "outputs": [
 {
@@ -783,10 +1001,7 @@
 "Initializing Twitter Scraper...\n",
 "Setup WebDriver...\n",
 "Initializing ChromeDriver...\n",
-"Logging in to Twitter...\n",
-"\n",
-"Login Successful\n",
-"\n"
+"WebDriver Setup Complete\n"
 ]
 }
 ],
@@ -801,11 +1016,33 @@
 " # scrape_username=\"something\",\n",
 " # scrape_hashtag=\"something\",\n",
 " # scrape_query=\"something\",\n",
-" # scrape_latest=True,\n",
-" # scrape_top=False,\n",
+" # scrape_latest=False,\n",
+" # scrape_top=True,\n",
+" # scrape_poster_details=True\n",
 ")"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 102,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\n",
+"Logging in to Twitter...\n",
+"\n",
+"Login Successful\n",
+"\n"
+]
+}
+],
+"source": [
+"scraper.login()"
+]
+},
 {
 "attachments": {},
 "cell_type": "markdown",
@@ -816,7 +1053,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 146,
+"execution_count": 103,
 "metadata": {},
 "outputs": [
 {
@@ -833,13 +1070,13 @@
 ],
 "source": [
 "scraper.scrape_tweets(\n",
-" # max_tweets=10,\n",
+" # max_tweets=100,\n",
 " # scrape_username=\"something\",\n",
 " # scrape_hashtag=\"something\",\n",
-" # scrape_hashtag=\"something\",\n",
 " # scrape_query=\"something\",\n",
-" # scrape_latest=True,\n",
-" # scrape_top=False,\n",
+" # scrape_latest=False,\n",
+" # scrape_top=True,\n",
+" # scrape_poster_details=True,\n",
 ")"
 ]
 },
@@ -853,7 +1090,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 147,
+"execution_count": 104,
 "metadata": {},
 "outputs": [
 {
@@ -861,7 +1098,7 @@
 "output_type": "stream",
 "text": [
 "Saving Tweets to CSV...\n",
-"CSV Saved: ./tweets/2023-09-23_09-54-41_tweets_1-50.csv\n"
+"CSV Saved: ./tweets/2023-09-24_23-47-18_tweets_1-50.csv\n"
 ]
 }
 ],
@@ -871,7 +1108,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 148,
+"execution_count": 105,
 "metadata": {},
 "outputs": [],
 "source": [