feat: Jupyter: scrape followers and following counts

Jarrian
2023-09-24 23:54:12 +08:00
parent 6f36bc33ca
commit da77993c12


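The change teaches `Tweet` to hover over a poster's display name and read the following/followers counts off the profile hover card. A minimal sketch of the technique, assembled from the diff below (the `read_poster_counts` helper is illustrative and not part of the commit; the XPaths are the ones the notebook uses):

from selenium.webdriver.common.action_chains import ActionChains

def read_poster_counts(driver, card):
    # Hover the display name so the profile hover card renders.
    name = card.find_element("xpath", './/div[@data-testid="User-Name"]//span')
    ActionChains(driver).move_to_element(name).perform()
    hover = driver.find_element("xpath", '//div[@data-testid="hoverCardParent"]')
    # Empty text falls back to "0", mirroring the notebook's defaults.
    following = hover.find_element(
        "xpath", './/a[contains(@href, "/following")]//span'
    ).text or "0"
    followers = hover.find_element(
        "xpath", './/a[contains(@href, "/verified_followers")]//span'
    ).text or "0"
    return following, followers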
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 140,
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
@@ -36,7 +36,9 @@
" StaleElementReferenceException,\n",
" WebDriverException,\n",
")\n",
"from selenium.webdriver.common.action_chains import ActionChains\n",
"\n",
"from selenium.webdriver.chrome.webdriver import WebDriver\n",
"from selenium.webdriver.chrome.options import Options as ChromeOptions\n",
"from selenium.webdriver.chrome.service import Service as ChromeService\n",
"\n",
@@ -55,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 141,
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
@@ -95,7 +97,7 @@
},
{
"cell_type": "code",
"execution_count": 142,
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
@@ -139,27 +141,37 @@
},
{
"cell_type": "code",
"execution_count": 143,
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"class Tweet:\n",
" def __init__(self, card: Chrome) -> None:\n",
" def __init__(\n",
" self,\n",
" card: WebDriver,\n",
" driver: WebDriver,\n",
" actions: ActionChains,\n",
" scrape_poster_details=False\n",
" ) -> None:\n",
" self.card = card\n",
" self.error = False\n",
" self.tweet = None\n",
"\n",
" try:\n",
" self.user = card.find_element(\n",
" \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n",
" ).text\n",
" except NoSuchElementException:\n",
" return\n",
" self.error = True\n",
" self.user = \"skip\"\n",
"\n",
" try:\n",
" self.handle = card.find_element(\n",
" \"xpath\", './/span[contains(text(), \"@\")]'\n",
" ).text\n",
" except NoSuchElementException:\n",
" return\n",
" self.error = True\n",
" self.handle = \"skip\"\n",
"\n",
" try:\n",
" self.date_time = card.find_element(\"xpath\", \".//time\").get_attribute(\n",
@@ -170,6 +182,10 @@
" self.is_ad = False\n",
" except NoSuchElementException:\n",
" self.is_ad = True\n",
" self.error = True\n",
" self.date_time = \"skip\"\n",
" \n",
" if self.error:\n",
" return\n",
"\n",
" try:\n",
@@ -267,7 +283,78 @@
" ).get_attribute(\"src\")\n",
" except NoSuchElementException:\n",
" self.profile_img = \"\"\n",
"\n",
" \n",
" self.following_cnt = \"0\"\n",
" self.followers_cnt = \"0\"\n",
" \n",
" if scrape_poster_details:\n",
" el_name = card.find_element(\n",
" \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n",
" )\n",
" \n",
" ext_hover_card = False\n",
" ext_following = False\n",
" ext_followers = False\n",
" hover_attempt = 0\n",
" \n",
" while not ext_hover_card or not ext_following or not ext_followers:\n",
" try:\n",
" actions.move_to_element(el_name).perform()\n",
" \n",
" hover_card = driver.find_element(\n",
" \"xpath\",\n",
" '//div[@data-testid=\"hoverCardParent\"]'\n",
" )\n",
" \n",
" ext_hover_card = True\n",
" \n",
" while not ext_following:\n",
" try:\n",
" self.following_cnt = hover_card.find_element(\n",
" \"xpath\",\n",
" './/a[contains(@href, \"/following\")]//span'\n",
" ).text\n",
" \n",
" if self.following_cnt == \"\":\n",
" self.following_cnt = \"0\"\n",
" \n",
" ext_following = True\n",
" except NoSuchElementException:\n",
" continue\n",
" except StaleElementReferenceException:\n",
" self.error = True\n",
" return\n",
" \n",
" while not ext_followers:\n",
" try:\n",
" self.followers_cnt = hover_card.find_element(\n",
" \"xpath\",\n",
" './/a[contains(@href, \"/verified_followers\")]//span'\n",
" ).text\n",
" \n",
" if self.followers_cnt == \"\":\n",
" self.followers_cnt = \"0\"\n",
" \n",
" ext_followers = True\n",
" except NoSuchElementException:\n",
" continue\n",
" except StaleElementReferenceException:\n",
" self.error = True\n",
" return\n",
" except NoSuchElementException:\n",
" if hover_attempt==3:\n",
" self.error\n",
" return\n",
" hover_attempt+=1\n",
" sleep(0.5)\n",
" continue\n",
" except StaleElementReferenceException:\n",
" self.error = True\n",
" return\n",
" \n",
" if ext_hover_card and ext_following and ext_followers:\n",
" actions.reset_actions()\n",
" \n",
" self.tweet = (\n",
" self.user,\n",
" self.handle,\n",
@@ -282,6 +369,8 @@
" self.mentions,\n",
" self.emojis,\n",
" self.profile_img,\n",
" self.following_cnt,\n",
" self.followers_cnt,\n",
" )\n",
"\n",
" pass\n"
@@ -299,12 +388,13 @@
},
{
"cell_type": "code",
"execution_count": 144,
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n",
"\n",
"\n",
"class Twitter_Scraper:\n",
" def __init__(\n",
" self,\n",
@@ -314,6 +404,7 @@
" scrape_username=None,\n",
" scrape_hashtag=None,\n",
" scrape_query=None,\n",
" scrape_poster_details=False,\n",
" scrape_latest=True,\n",
" scrape_top=False,\n",
" ):\n",
@@ -329,13 +420,14 @@
" \"hashtag\": None,\n",
" \"query\": None,\n",
" \"tab\": None,\n",
" \"poster_details\": False,\n",
" }\n",
" self.max_tweets = max_tweets\n",
" self.progress = Progress(0, max_tweets)\n",
" self.router = self.go_to_home\n",
" self.driver = self._get_driver()\n",
" self.actions = ActionChains(self.driver)\n",
" self.scroller = Scroller(self.driver)\n",
" self._login()\n",
" self._config_scraper(\n",
" max_tweets,\n",
" scrape_username,\n",
@@ -343,6 +435,7 @@
" scrape_query,\n",
" scrape_latest,\n",
" scrape_top,\n",
" scrape_poster_details,\n",
" )\n",
"\n",
" def _config_scraper(\n",
@@ -353,6 +446,7 @@
" scrape_query=None,\n",
" scrape_latest=True,\n",
" scrape_top=False,\n",
" scrape_poster_details=False,\n",
" ):\n",
" self.tweet_ids = set()\n",
" self.data = []\n",
@@ -367,6 +461,7 @@
" else None,\n",
" \"query\": scrape_query,\n",
" \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n",
" \"poster_details\": scrape_poster_details,\n",
" }\n",
" self.router = self.go_to_home\n",
" self.scroller = Scroller(self.driver)\n",
@@ -408,6 +503,7 @@
" options=browser_option,\n",
" )\n",
"\n",
" print(\"WebDriver Setup Complete\")\n",
" return driver\n",
" except WebDriverException:\n",
" try:\n",
@@ -421,17 +517,20 @@
" options=browser_option,\n",
" )\n",
"\n",
" print(\"WebDriver Setup Complete\")\n",
" return driver\n",
" except Exception as e:\n",
" print(f\"Error setting up WebDriver: {e}\")\n",
" sys.exit(1)\n",
" pass\n",
"\n",
" def _login(self):\n",
" def login(self):\n",
" print()\n",
" print(\"Logging in to Twitter...\")\n",
"\n",
" try:\n",
" self.driver.get(TWITTER_LOGIN_URL)\n",
" self.driver.maximize_window()\n",
" self.driver.get(TWITTER_LOGIN_URL)\n",
" sleep(3)\n",
"\n",
" self._input_username()\n",
@@ -594,10 +693,24 @@
"\n",
" def get_tweet_cards(self):\n",
" self.tweet_cards = self.driver.find_elements(\n",
" \"xpath\", '//article[@data-testid=\"tweet\"]'\n",
" \"xpath\", '//article[@data-testid=\"tweet\" and not(@disabled)]'\n",
" )\n",
" pass\n",
"\n",
" def remove_hidden_cards(self):\n",
" try:\n",
" hidden_cards = self.driver.find_elements(\n",
" \"xpath\", '//article[@data-testid=\"tweet\" and @disabled]'\n",
" )\n",
"\n",
" for card in hidden_cards[1:-2]:\n",
" self.driver.execute_script(\n",
" \"arguments[0].parentNode.parentNode.parentNode.remove();\", card\n",
" )\n",
" except Exception as e:\n",
" return\n",
" pass\n",
"\n",
" def scrape_tweets(\n",
" self,\n",
" max_tweets=50,\n",
@@ -606,6 +719,7 @@
" scrape_query=None,\n",
" scrape_latest=True,\n",
" scrape_top=False,\n",
" scrape_poster_details=False,\n",
" router=None,\n",
" ):\n",
" self._config_scraper(\n",
@@ -615,6 +729,7 @@
" scrape_query,\n",
" scrape_latest,\n",
" scrape_top,\n",
" scrape_poster_details,\n",
" )\n",
"\n",
" if router is None:\n",
@@ -645,6 +760,8 @@
"\n",
" refresh_count = 0\n",
" added_tweets = 0\n",
" empty_count = 0\n",
" # stale_count = 0\n",
"\n",
" while self.scroller.scrolling:\n",
" try:\n",
@@ -652,62 +769,159 @@
" added_tweets = 0\n",
"\n",
" for card in self.tweet_cards[-15:]:\n",
" tweet = Tweet(card)\n",
"\n",
" try:\n",
" tweet_id = f\"{tweet.user}{tweet.handle}{tweet.date_time}\"\n",
" except Exception as e:\n",
" tweet_id = str(card)\n",
"\n",
" # def hide_card(el):\n",
" # self.driver.execute_script(\n",
" # \"arguments[0].setAttribute('disabled', true);\", el\n",
" # )\n",
"\n",
" # self.driver.execute_script(\n",
" # \"arguments[0].parentElement.parentElement.parentElement.style.display='none';\", el\n",
" # )\n",
"\n",
" if tweet_id not in self.tweet_ids:\n",
" self.tweet_ids.add(tweet_id)\n",
"\n",
" if not self.scraper_details[\"poster_details\"]:\n",
" self.driver.execute_script(\n",
" \"arguments[0].scrollIntoView();\", card\n",
" )\n",
"\n",
" tweet = Tweet(\n",
" card=card,\n",
" driver=self.driver,\n",
" actions=self.actions,\n",
" scrape_poster_details=self.scraper_details[\n",
" \"poster_details\"\n",
" ],\n",
" )\n",
"\n",
" if tweet:\n",
" # try:\n",
" # tweet_sig = f\"{tweet.user}|{tweet.handle}|{tweet.date_time}|{tweet.is_ad}\"\n",
" # except Exception as e:\n",
" # continue\n",
"\n",
" # if tweet_sig not in self.tweet_ids:\n",
" # self.tweet_ids.add(tweet_sig)\n",
"\n",
" if not tweet.error and tweet.tweet is not None:\n",
" if not tweet.is_ad:\n",
" self.data.append(tweet.tweet)\n",
" added_tweets += 1\n",
" self.progress.print_progress(len(self.data))\n",
"\n",
" if len(self.data) >= self.max_tweets:\n",
" self.scroller.scrolling = False\n",
" break\n",
" else:\n",
" continue\n",
" else:\n",
" continue\n",
" # else:\n",
" # continue\n",
" else:\n",
" continue\n",
" else:\n",
" continue\n",
" # hide_card(card)\n",
" except NoSuchElementException:\n",
" continue\n",
"\n",
" if tweet_id not in self.tweet_ids:\n",
" self.tweet_ids.add(tweet_id)\n",
" if tweet:\n",
" if not tweet.is_ad:\n",
" self.data.append(tweet.tweet)\n",
" added_tweets += 1\n",
" self.progress.print_progress(len(self.data))\n",
"\n",
" if len(self.data) >= self.max_tweets:\n",
" self.scroller.scrolling = False\n",
" break\n",
"\n",
" if len(self.data) % 50 == 0:\n",
" sleep(2)\n",
"\n",
" if len(self.data) >= self.max_tweets:\n",
" break\n",
"\n",
" # self.remove_hidden_cards()\n",
"\n",
" # if added_tweets == 0:\n",
" # refresh_count += 1\n",
"\n",
" # if len(self.tweet_cards) > 0:\n",
" # self.driver.execute_script(\n",
" # \"arguments[0].scrollIntoView();\", self.tweet_cards[-1]\n",
" # )\n",
" # sleep(2)\n",
"\n",
" # sleep(1)\n",
"\n",
" # if refresh_count >= 10:\n",
" # print()\n",
" # print(\"No more tweets to scrape\")\n",
" # break\n",
" # else:\n",
" # refresh_count = 0\n",
"\n",
" # if len(self.tweet_cards) == 0:\n",
" # empty_count += 1\n",
" # sleep(1)\n",
"\n",
" # if empty_count >= 3:\n",
" # router()\n",
" # sleep(2)\n",
" # break\n",
" # else:\n",
" # empty_count = 0\n",
"\n",
" # if added_tweets == 0:\n",
" # refresh_count += 1\n",
" # sleep(1)\n",
" # if refresh_count >= 10:\n",
" # print()\n",
" # print(\"No more tweets to scrape\")\n",
" # break\n",
" # else:\n",
" # refresh_count = 0\n",
"\n",
" if added_tweets == 0:\n",
" refresh_count += 1\n",
" if refresh_count >= 10:\n",
" print()\n",
" print(\"No more tweets to scrape\")\n",
" break\n",
" if empty_count >= 5:\n",
" if refresh_count >= 3:\n",
" print()\n",
" print(\"No more tweets to scrape\")\n",
" break\n",
" # router()\n",
" # sleep(2)\n",
" refresh_count += 1\n",
" empty_count += 1\n",
" sleep(1)\n",
" else:\n",
" empty_count = 0\n",
" refresh_count = 0\n",
"\n",
" self.scroller.scroll_count = 0\n",
" # self.scroller.scroll_count = 0\n",
"\n",
" while True:\n",
" self.scroller.scroll_to_bottom()\n",
" sleep(2)\n",
" self.scroller.update_scroll_position()\n",
" # while True:\n",
" # self.scroller.scroll_to_bottom()\n",
" # sleep(2)\n",
" # self.scroller.update_scroll_position()\n",
"\n",
" if self.scroller.last_position == self.scroller.current_position:\n",
" self.scroller.scroll_count += 1\n",
" # if self.scroller.last_position == self.scroller.current_position:\n",
" # # self.scroller.scroll_count += 1\n",
"\n",
" if self.scroller.scroll_count >= 3:\n",
" router()\n",
" sleep(2)\n",
" break\n",
" else:\n",
" sleep(1)\n",
" else:\n",
" self.scroller.last_position = self.scroller.current_position\n",
" break\n",
" # # if self.scroller.scroll_count >= 3:\n",
" # # router()\n",
" # # sleep(2)\n",
" # # break\n",
" # # else:\n",
" # # sleep(1)\n",
" # sleep(2)\n",
" # else:\n",
" # self.scroller.last_position = self.scroller.current_position\n",
" # break\n",
" except StaleElementReferenceException:\n",
" router()\n",
" # stale_count += 1\n",
"\n",
" # if stale_count >= 3:\n",
" # router()\n",
" # stale_count = 0\n",
"\n",
" sleep(2)\n",
" continue\n",
" except KeyboardInterrupt:\n",
" print(\"\\n\")\n",
" print(\"Keyboard Interrupt\")\n",
" break\n",
" except Exception as e:\n",
" print(\"\\n\")\n",
" print(f\"Error scraping tweets: {e}\")\n",
@@ -749,6 +963,10 @@
" \"Profile Image\": [tweet[12] for tweet in self.data],\n",
" }\n",
"\n",
" if self.scraper_details[\"poster_details\"]:\n",
" data[\"Following\"] = [tweet[13] for tweet in self.data]\n",
" data[\"Followers\"] = [tweet[14] for tweet in self.data]\n",
"\n",
" df = pd.DataFrame(data)\n",
"\n",
" current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
@@ -773,7 +991,7 @@
},
{
"cell_type": "code",
"execution_count": 145,
"execution_count": 101,
"metadata": {},
"outputs": [
{
@@ -783,10 +1001,7 @@
"Initializing Twitter Scraper...\n",
"Setup WebDriver...\n",
"Initializing ChromeDriver...\n",
"Logging in to Twitter...\n",
"\n",
"Login Successful\n",
"\n"
"WebDriver Setup Complete\n"
]
}
],
@@ -801,11 +1016,33 @@
" # scrape_username=\"something\",\n",
" # scrape_hashtag=\"something\",\n",
" # scrape_query=\"something\",\n",
" # scrape_latest=True,\n",
" # scrape_top=False,\n",
" # scrape_latest=False,\n",
" # scrape_top=True,\n",
" # scrape_poster_details=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Logging in to Twitter...\n",
"\n",
"Login Successful\n",
"\n"
]
}
],
"source": [
"scraper.login()"
]
},
{
"attachments": {},
"cell_type": "markdown",
@@ -816,7 +1053,7 @@
},
{
"cell_type": "code",
"execution_count": 146,
"execution_count": 103,
"metadata": {},
"outputs": [
{
@@ -833,13 +1070,13 @@
],
"source": [
"scraper.scrape_tweets(\n",
" # max_tweets=10,\n",
" # max_tweets=100,\n",
" # scrape_username=\"something\",\n",
" # scrape_hashtag=\"something\",\n",
" # scrape_hashtag=\"something\",\n",
" # scrape_query=\"something\",\n",
" # scrape_latest=True,\n",
" # scrape_top=False,\n",
" # scrape_latest=False,\n",
" # scrape_top=True,\n",
" # scrape_poster_details=True,\n",
")"
]
},
@@ -853,7 +1090,7 @@
},
{
"cell_type": "code",
"execution_count": 147,
"execution_count": 104,
"metadata": {},
"outputs": [
{
@@ -861,7 +1098,7 @@
"output_type": "stream",
"text": [
"Saving Tweets to CSV...\n",
"CSV Saved: ./tweets/2023-09-23_09-54-41_tweets_1-50.csv\n"
"CSV Saved: ./tweets/2023-09-24_23-47-18_tweets_1-50.csv\n"
]
}
],
@@ -871,7 +1108,7 @@
},
{
"cell_type": "code",
"execution_count": 148,
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [