headless twitter scraper

This commit is contained in:
Jarrian
2023-09-09 00:02:46 +08:00
parent 1349a8ef50
commit bea29a4bf4

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 103,
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 104,
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
@@ -54,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 105,
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
@@ -65,12 +65,18 @@
" self.last_position = driver.execute_script(\"return window.pageYOffset;\")\n",
" self.scrolling = True\n",
" self.scroll_count = 0\n",
" pass\n",
" \n",
" def reset(self) -> None:\n",
" self.current_position = 0\n",
" self.last_position = self.driver.execute_script(\"return window.pageYOffset;\")\n",
" self.scroll_count = 0\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 106,
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
@@ -180,7 +186,7 @@
},
{
"cell_type": "code",
"execution_count": 107,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -239,7 +245,7 @@
" './/div[@data-testid=\"reply\"]//span'\n",
" ).text\n",
" except NoSuchElementException:\n",
" self.reply_cnt = 0\n",
" self.reply_cnt = '0'\n",
" \n",
" try:\n",
" self.retweet_cnt = card.find_element(\n",
@@ -247,7 +253,7 @@
" './/div[@data-testid=\"retweet\"]//span'\n",
" ).text\n",
" except NoSuchElementException:\n",
" self.retweet_cnt = 0\n",
" self.retweet_cnt = '0'\n",
" \n",
" try:\n",
" self.like_cnt = card.find_element(\n",
@@ -255,7 +261,7 @@
" './/div[@data-testid=\"like\"]//span'\n",
" ).text\n",
" except NoSuchElementException:\n",
" self.like_cnt = 0\n",
" self.like_cnt = '0'\n",
" \n",
" self.tweet = (\n",
" self.user,\n",
@@ -273,17 +279,17 @@
},
{
"cell_type": "code",
"execution_count": 108,
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Progress: [[========================================]] 100.00% 50 of 50\n",
"Progress: [[========================================]] 100.00% 500 of 500\n",
"\n",
"Scraping Complete\n",
"Tweets: 50\n"
"Tweets: 500\n"
]
}
],
@@ -291,15 +297,16 @@
"scraper = Twitter_Scraper(\n",
" username=USER_UNAME,\n",
" password=USER_PASSWORD,\n",
" max_tweets=50\n",
" max_tweets=500\n",
")\n",
"\n",
"scraper.go_to_home()\n",
"progress = Progress(0, scraper.max_tweets)\n",
"progress.print_progress(0)\n",
"\n",
"while scraper.scroller.scrolling:\n",
" scraper.get_tweets()\n",
"\n",
" \n",
" for card in scraper.tweet_cards[-15:]:\n",
" tweet_id = str(card)\n",
" if tweet_id not in scraper.tweet_ids:\n",
@@ -313,6 +320,9 @@
" if len(scraper.data) >= scraper.max_tweets:\n",
" scraper.scroller.scrolling = False\n",
" break\n",
" \n",
" if len(scraper.data) % 50 == 0:\n",
" sleep(2)\n",
"\n",
" if len(scraper.data) >= scraper.max_tweets:\n",
" break\n",
@@ -321,17 +331,20 @@
" \n",
" while True:\n",
" scraper.driver.execute_script(\n",
" 'window.scrollTo(0, document.body.scrollHeight);')\n",
" 'window.scrollTo(0, document.body.scrollHeight);'\n",
" )\n",
" sleep(2)\n",
" scraper.scroller.current_position = scraper.driver.execute_script(\n",
" \"return window.pageYOffset;\"\n",
" \"return window.pageYOffset;\"\n",
" )\n",
" \n",
" \n",
" if scraper.scroller.last_position == scraper.scroller.current_position:\n",
" scraper.scroller.scroll_count += 1\n",
" \n",
" if scraper.scroller.scroll_count >= 3:\n",
" scraper.scroller.scrolling = False\n",
" scraper.go_to_home()\n",
" sleep(2)\n",
" scraper.scroller.reset()\n",
" break\n",
" else:\n",
" sleep(2)\n",
@@ -339,13 +352,14 @@
" scraper.scroller.last_position = scraper.scroller.current_position\n",
" break\n",
"\n",
"scraper.driver.close()\n",
"print(\"Scraping Complete\")\n",
"print(\"Tweets: {}\".format(len(scraper.data)))"
]
},
{
"cell_type": "code",
"execution_count": 109,
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@@ -371,7 +385,7 @@
},
{
"cell_type": "code",
"execution_count": 110,
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
@@ -387,7 +401,7 @@
},
{
"cell_type": "code",
"execution_count": 111,
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [