headless twitter scraper
This commit is contained in:
54
main.ipynb
54
main.ipynb
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 103,
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -29,7 +29,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 104,
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -54,7 +54,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 105,
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -65,12 +65,18 @@
|
||||
" self.last_position = driver.execute_script(\"return window.pageYOffset;\")\n",
|
||||
" self.scrolling = True\n",
|
||||
" self.scroll_count = 0\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" def reset(self) -> None:\n",
|
||||
" self.current_position = 0\n",
|
||||
" self.last_position = self.driver.execute_script(\"return window.pageYOffset;\")\n",
|
||||
" self.scroll_count = 0\n",
|
||||
" pass"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 106,
|
||||
"execution_count": 55,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -180,7 +186,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 107,
|
||||
"execution_count": 56,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -239,7 +245,7 @@
|
||||
" './/div[@data-testid=\"reply\"]//span'\n",
|
||||
" ).text\n",
|
||||
" except NoSuchElementException:\n",
|
||||
" self.reply_cnt = 0\n",
|
||||
" self.reply_cnt = '0'\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" self.retweet_cnt = card.find_element(\n",
|
||||
@@ -247,7 +253,7 @@
|
||||
" './/div[@data-testid=\"retweet\"]//span'\n",
|
||||
" ).text\n",
|
||||
" except NoSuchElementException:\n",
|
||||
" self.retweet_cnt = 0\n",
|
||||
" self.retweet_cnt = '0'\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" self.like_cnt = card.find_element(\n",
|
||||
@@ -255,7 +261,7 @@
|
||||
" './/div[@data-testid=\"like\"]//span'\n",
|
||||
" ).text\n",
|
||||
" except NoSuchElementException:\n",
|
||||
" self.like_cnt = 0\n",
|
||||
" self.like_cnt = '0'\n",
|
||||
" \n",
|
||||
" self.tweet = (\n",
|
||||
" self.user,\n",
|
||||
@@ -273,17 +279,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 108,
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Progress: [[========================================]] 100.00% 50 of 50\n",
|
||||
"Progress: [[========================================]] 100.00% 500 of 500\n",
|
||||
"\n",
|
||||
"Scraping Complete\n",
|
||||
"Tweets: 50\n"
|
||||
"Tweets: 500\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -291,15 +297,16 @@
|
||||
"scraper = Twitter_Scraper(\n",
|
||||
" username=USER_UNAME,\n",
|
||||
" password=USER_PASSWORD,\n",
|
||||
" max_tweets=50\n",
|
||||
" max_tweets=500\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"scraper.go_to_home()\n",
|
||||
"progress = Progress(0, scraper.max_tweets)\n",
|
||||
"progress.print_progress(0)\n",
|
||||
"\n",
|
||||
"while scraper.scroller.scrolling:\n",
|
||||
" scraper.get_tweets()\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" for card in scraper.tweet_cards[-15:]:\n",
|
||||
" tweet_id = str(card)\n",
|
||||
" if tweet_id not in scraper.tweet_ids:\n",
|
||||
@@ -313,6 +320,9 @@
|
||||
" if len(scraper.data) >= scraper.max_tweets:\n",
|
||||
" scraper.scroller.scrolling = False\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" if len(scraper.data) % 50 == 0:\n",
|
||||
" sleep(2)\n",
|
||||
"\n",
|
||||
" if len(scraper.data) >= scraper.max_tweets:\n",
|
||||
" break\n",
|
||||
@@ -321,17 +331,20 @@
|
||||
" \n",
|
||||
" while True:\n",
|
||||
" scraper.driver.execute_script(\n",
|
||||
" 'window.scrollTo(0, document.body.scrollHeight);')\n",
|
||||
" 'window.scrollTo(0, document.body.scrollHeight);'\n",
|
||||
" )\n",
|
||||
" sleep(2)\n",
|
||||
" scraper.scroller.current_position = scraper.driver.execute_script(\n",
|
||||
" \"return window.pageYOffset;\"\n",
|
||||
" \"return window.pageYOffset;\"\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if scraper.scroller.last_position == scraper.scroller.current_position:\n",
|
||||
" scraper.scroller.scroll_count += 1\n",
|
||||
" \n",
|
||||
" if scraper.scroller.scroll_count >= 3:\n",
|
||||
" scraper.scroller.scrolling = False\n",
|
||||
" scraper.go_to_home()\n",
|
||||
" sleep(2)\n",
|
||||
" scraper.scroller.reset()\n",
|
||||
" break\n",
|
||||
" else:\n",
|
||||
" sleep(2)\n",
|
||||
@@ -339,13 +352,14 @@
|
||||
" scraper.scroller.last_position = scraper.scroller.current_position\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"scraper.driver.close()\n",
|
||||
"print(\"Scraping Complete\")\n",
|
||||
"print(\"Tweets: {}\".format(len(scraper.data)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 109,
|
||||
"execution_count": 58,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -371,7 +385,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 110,
|
||||
"execution_count": 59,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -387,7 +401,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 111,
|
||||
"execution_count": 60,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
||||
Reference in New Issue
Block a user