headless twitter scraper

This commit is contained in:
Jarrian
2023-09-09 00:02:46 +08:00
parent 1349a8ef50
commit bea29a4bf4

View File

@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 103, "execution_count": 52,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -29,7 +29,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 104, "execution_count": 53,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -54,7 +54,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 105, "execution_count": 54,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -65,12 +65,18 @@
" self.last_position = driver.execute_script(\"return window.pageYOffset;\")\n", " self.last_position = driver.execute_script(\"return window.pageYOffset;\")\n",
" self.scrolling = True\n", " self.scrolling = True\n",
" self.scroll_count = 0\n", " self.scroll_count = 0\n",
" pass\n",
" \n",
" def reset(self) -> None:\n",
" self.current_position = 0\n",
" self.last_position = self.driver.execute_script(\"return window.pageYOffset;\")\n",
" self.scroll_count = 0\n",
" pass" " pass"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 106, "execution_count": 55,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -180,7 +186,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 107, "execution_count": 56,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -239,7 +245,7 @@
" './/div[@data-testid=\"reply\"]//span'\n", " './/div[@data-testid=\"reply\"]//span'\n",
" ).text\n", " ).text\n",
" except NoSuchElementException:\n", " except NoSuchElementException:\n",
" self.reply_cnt = 0\n", " self.reply_cnt = '0'\n",
" \n", " \n",
" try:\n", " try:\n",
" self.retweet_cnt = card.find_element(\n", " self.retweet_cnt = card.find_element(\n",
@@ -247,7 +253,7 @@
" './/div[@data-testid=\"retweet\"]//span'\n", " './/div[@data-testid=\"retweet\"]//span'\n",
" ).text\n", " ).text\n",
" except NoSuchElementException:\n", " except NoSuchElementException:\n",
" self.retweet_cnt = 0\n", " self.retweet_cnt = '0'\n",
" \n", " \n",
" try:\n", " try:\n",
" self.like_cnt = card.find_element(\n", " self.like_cnt = card.find_element(\n",
@@ -255,7 +261,7 @@
" './/div[@data-testid=\"like\"]//span'\n", " './/div[@data-testid=\"like\"]//span'\n",
" ).text\n", " ).text\n",
" except NoSuchElementException:\n", " except NoSuchElementException:\n",
" self.like_cnt = 0\n", " self.like_cnt = '0'\n",
" \n", " \n",
" self.tweet = (\n", " self.tweet = (\n",
" self.user,\n", " self.user,\n",
@@ -273,17 +279,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 108, "execution_count": 57,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Progress: [[========================================]] 100.00% 50 of 50\n", "Progress: [[========================================]] 100.00% 500 of 500\n",
"\n", "\n",
"Scraping Complete\n", "Scraping Complete\n",
"Tweets: 50\n" "Tweets: 500\n"
] ]
} }
], ],
@@ -291,15 +297,16 @@
"scraper = Twitter_Scraper(\n", "scraper = Twitter_Scraper(\n",
" username=USER_UNAME,\n", " username=USER_UNAME,\n",
" password=USER_PASSWORD,\n", " password=USER_PASSWORD,\n",
" max_tweets=50\n", " max_tweets=500\n",
")\n", ")\n",
"\n", "\n",
"scraper.go_to_home()\n", "scraper.go_to_home()\n",
"progress = Progress(0, scraper.max_tweets)\n", "progress = Progress(0, scraper.max_tweets)\n",
"progress.print_progress(0)\n",
"\n", "\n",
"while scraper.scroller.scrolling:\n", "while scraper.scroller.scrolling:\n",
" scraper.get_tweets()\n", " scraper.get_tweets()\n",
"\n", " \n",
" for card in scraper.tweet_cards[-15:]:\n", " for card in scraper.tweet_cards[-15:]:\n",
" tweet_id = str(card)\n", " tweet_id = str(card)\n",
" if tweet_id not in scraper.tweet_ids:\n", " if tweet_id not in scraper.tweet_ids:\n",
@@ -313,6 +320,9 @@
" if len(scraper.data) >= scraper.max_tweets:\n", " if len(scraper.data) >= scraper.max_tweets:\n",
" scraper.scroller.scrolling = False\n", " scraper.scroller.scrolling = False\n",
" break\n", " break\n",
" \n",
" if len(scraper.data) % 50 == 0:\n",
" sleep(2)\n",
"\n", "\n",
" if len(scraper.data) >= scraper.max_tweets:\n", " if len(scraper.data) >= scraper.max_tweets:\n",
" break\n", " break\n",
@@ -321,17 +331,20 @@
" \n", " \n",
" while True:\n", " while True:\n",
" scraper.driver.execute_script(\n", " scraper.driver.execute_script(\n",
" 'window.scrollTo(0, document.body.scrollHeight);')\n", " 'window.scrollTo(0, document.body.scrollHeight);'\n",
" )\n",
" sleep(2)\n", " sleep(2)\n",
" scraper.scroller.current_position = scraper.driver.execute_script(\n", " scraper.scroller.current_position = scraper.driver.execute_script(\n",
" \"return window.pageYOffset;\"\n", " \"return window.pageYOffset;\"\n",
" )\n", " )\n",
" \n", " \n",
" if scraper.scroller.last_position == scraper.scroller.current_position:\n", " if scraper.scroller.last_position == scraper.scroller.current_position:\n",
" scraper.scroller.scroll_count += 1\n", " scraper.scroller.scroll_count += 1\n",
" \n", " \n",
" if scraper.scroller.scroll_count >= 3:\n", " if scraper.scroller.scroll_count >= 3:\n",
" scraper.scroller.scrolling = False\n", " scraper.go_to_home()\n",
" sleep(2)\n",
" scraper.scroller.reset()\n",
" break\n", " break\n",
" else:\n", " else:\n",
" sleep(2)\n", " sleep(2)\n",
@@ -339,13 +352,14 @@
" scraper.scroller.last_position = scraper.scroller.current_position\n", " scraper.scroller.last_position = scraper.scroller.current_position\n",
" break\n", " break\n",
"\n", "\n",
"scraper.driver.close()\n",
"print(\"Scraping Complete\")\n", "print(\"Scraping Complete\")\n",
"print(\"Tweets: {}\".format(len(scraper.data)))" "print(\"Tweets: {}\".format(len(scraper.data)))"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 109, "execution_count": 58,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -371,7 +385,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 110, "execution_count": 59,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -387,7 +401,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 111, "execution_count": 60,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [