update scraper configurator

This commit is contained in:
Jarrian
2023-09-20 09:27:22 +08:00
parent 2553d26590
commit c896baa165
3 changed files with 216 additions and 59 deletions

View File

@@ -17,7 +17,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -55,7 +55,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -95,7 +95,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -139,7 +139,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -264,13 +264,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n", "TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n",
"\n", "\n",
"\n",
"class Twitter_Scraper:\n", "class Twitter_Scraper:\n",
" def __init__(\n", " def __init__(\n",
" self,\n", " self,\n",
@@ -286,7 +285,45 @@
" print(\"Initializing Twitter Scraper...\")\n", " print(\"Initializing Twitter Scraper...\")\n",
" self.username = username\n", " self.username = username\n",
" self.password = password\n", " self.password = password\n",
" self.tweet_ids = set()\n",
" self.data = []\n", " self.data = []\n",
" self.tweet_cards = []\n",
" self.scraper_details = {\n",
" \"type\": None,\n",
" \"username\": None,\n",
" \"hashtag\": None,\n",
" \"query\": None,\n",
" \"tab\": None,\n",
" }\n",
" self.max_tweets = max_tweets\n",
" self.progress = Progress(0, max_tweets)\n",
" self.router = self.go_to_home\n",
" self.driver = self._get_driver()\n",
" self.scroller = Scroller(self.driver)\n",
" self._login()\n",
" self._config_scraper(\n",
" max_tweets,\n",
" scrape_username,\n",
" scrape_hashtag,\n",
" scrape_query,\n",
" scrape_latest,\n",
" scrape_top,\n",
" )\n",
"\n",
" def _config_scraper(\n",
" self,\n",
" max_tweets=50,\n",
" scrape_username=None,\n",
" scrape_hashtag=None,\n",
" scrape_query=None,\n",
" scrape_latest=True,\n",
" scrape_top=False,\n",
" ):\n",
" self.tweet_ids = set()\n",
" self.data = []\n",
" self.tweet_cards = []\n",
" self.max_tweets = max_tweets\n",
" self.progress = Progress(0, max_tweets)\n",
" self.scraper_details = {\n", " self.scraper_details = {\n",
" \"type\": None,\n", " \"type\": None,\n",
" \"username\": scrape_username,\n", " \"username\": scrape_username,\n",
@@ -297,13 +334,6 @@
" \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n", " \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n",
" }\n", " }\n",
" self.router = self.go_to_home\n", " self.router = self.go_to_home\n",
" self.tweet_ids = set()\n",
" self.max_tweets = max_tweets\n",
" self.progress = Progress(0, max_tweets)\n",
" self.tweet_cards = []\n",
" self.driver = self._get_driver()\n",
" self.scroller = Scroller(self.driver)\n",
" self._login()\n",
"\n", "\n",
" if scrape_username is not None:\n", " if scrape_username is not None:\n",
" self.scraper_details[\"type\"] = \"Username\"\n", " self.scraper_details[\"type\"] = \"Username\"\n",
@@ -317,6 +347,7 @@
" else:\n", " else:\n",
" self.scraper_details[\"type\"] = \"Home\"\n", " self.scraper_details[\"type\"] = \"Home\"\n",
" self.router = self.go_to_home\n", " self.router = self.go_to_home\n",
" pass\n",
"\n", "\n",
" def _get_driver(self):\n", " def _get_driver(self):\n",
" print(\"Setup WebDriver...\")\n", " print(\"Setup WebDriver...\")\n",
@@ -485,26 +516,44 @@
" pass\n", " pass\n",
"\n", "\n",
" def go_to_profile(self):\n", " def go_to_profile(self):\n",
" self.driver.get(f\"https://twitter.com/{self.scraper_details['username']}\")\n", " if (\n",
" sleep(3)\n", " self.scraper_details[\"username\"] is None\n",
" or self.scraper_details[\"username\"] == \"\"\n",
" ):\n",
" print(\"Username is not set.\")\n",
" sys.exit(1)\n",
" else:\n",
" self.driver.get(f\"https://twitter.com/{self.scraper_details['username']}\")\n",
" sleep(3)\n",
" pass\n", " pass\n",
"\n", "\n",
" def go_to_hashtag(self):\n", " def go_to_hashtag(self):\n",
" url = f\"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click\"\n", " if (\n",
" if self.scraper_details[\"tab\"] == \"Latest\":\n", " self.scraper_details[\"hashtag\"] is None\n",
" url += \"&f=live\"\n", " or self.scraper_details[\"hashtag\"] == \"\"\n",
" ):\n",
" print(\"Hashtag is not set.\")\n",
" sys.exit(1)\n",
" else:\n",
" url = f\"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click\"\n",
" if self.scraper_details[\"tab\"] == \"Latest\":\n",
" url += \"&f=live\"\n",
"\n", "\n",
" self.driver.get(url)\n", " self.driver.get(url)\n",
" sleep(3)\n", " sleep(3)\n",
" pass\n", " pass\n",
"\n", "\n",
" def go_to_search(self):\n", " def go_to_search(self):\n",
" url = f\"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query\"\n", " if self.scraper_details[\"query\"] is None or self.scraper_details[\"query\"] == \"\":\n",
" if self.scraper_details[\"tab\"] == \"Latest\":\n", " print(\"Query is not set.\")\n",
" url += \"&f=live\"\n", " sys.exit(1)\n",
" else:\n",
" url = f\"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query\"\n",
" if self.scraper_details[\"tab\"] == \"Latest\":\n",
" url += \"&f=live\"\n",
"\n", "\n",
" self.driver.get(url)\n", " self.driver.get(url)\n",
" sleep(3)\n", " sleep(3)\n",
" pass\n", " pass\n",
"\n", "\n",
" def get_tweet_cards(self):\n", " def get_tweet_cards(self):\n",
@@ -513,7 +562,25 @@
" )\n", " )\n",
" pass\n", " pass\n",
"\n", "\n",
" def scrape_tweets(self, router=None):\n", " def scrape_tweets(\n",
" self,\n",
" max_tweets=50,\n",
" scrape_username=None,\n",
" scrape_hashtag=None,\n",
" scrape_query=None,\n",
" scrape_latest=True,\n",
" scrape_top=False,\n",
" router=None,\n",
" ):\n",
" self._config_scraper(\n",
" max_tweets,\n",
" scrape_username,\n",
" scrape_hashtag,\n",
" scrape_query,\n",
" scrape_latest,\n",
" scrape_top,\n",
" )\n",
"\n",
" if router is None:\n", " if router is None:\n",
" router = self.router\n", " router = self.router\n",
"\n", "\n",
@@ -655,7 +722,7 @@
" pass\n", " pass\n",
"\n", "\n",
" def get_tweets(self):\n", " def get_tweets(self):\n",
" return self.data\n" " return self.data"
] ]
}, },
{ {
@@ -668,7 +735,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -692,8 +759,8 @@
"scraper = Twitter_Scraper(\n", "scraper = Twitter_Scraper(\n",
" username=USER_UNAME,\n", " username=USER_UNAME,\n",
" password=USER_PASSWORD,\n", " password=USER_PASSWORD,\n",
" max_tweets=10,\n", " # max_tweets=10,\n",
" # scrape_username=\"something\",\n", " # scrape_username=\"\",\n",
" # scrape_hashtag=\"something\",\n", " # scrape_hashtag=\"something\",\n",
" # scrape_query=\"something\",\n", " # scrape_query=\"something\",\n",
" # scrape_latest=True,\n", " # scrape_latest=True,\n",
@@ -711,7 +778,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 28,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -719,15 +786,22 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Scraping Tweets from Home...\n", "Scraping Tweets from Home...\n",
"Progress: [[========================================]] 100.00% 10 of 10\n", "Progress: [[========================================]] 100.00% 50 of 50\n",
"Scraping Complete\n", "Scraping Complete\n",
"Tweets: 10 out of 10\n", "Tweets: 50 out of 50\n",
"\n" "\n"
] ]
} }
], ],
"source": [ "source": [
"scraper.scrape_tweets()" "scraper.scrape_tweets(\n",
" # max_tweets=10,\n",
" # scrape_username=\"something\",\n",
" # scrape_hashtag=\"something\",\n",
" # scrape_query=\"something\",\n",
" # scrape_latest=True,\n",
" # scrape_top=False,\n",
")"
] ]
}, },
{ {
@@ -740,7 +814,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -748,12 +822,20 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Saving Tweets to CSV...\n", "Saving Tweets to CSV...\n",
"CSV Saved: ./tweets/2023-09-13_17-14-51_tweets_1-10.csv\n" "CSV Saved: ./tweets/2023-09-20_09-26-30_tweets_1-50.csv\n"
] ]
} }
], ],
"source": [ "source": [
"scraper.save_to_csv()\n", "scraper.save_to_csv()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scraper.driver.close()" "scraper.driver.close()"
] ]
} }
@@ -774,7 +856,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.12" "version": "3.11.5"
}, },
"orig_nbformat": 4 "orig_nbformat": 4
}, },

View File

@@ -115,7 +115,14 @@ def main():
scrape_top=args.top, scrape_top=args.top,
) )
scraper.scrape_tweets() scraper.scrape_tweets(
max_tweets=args.tweets,
scrape_username=args.username,
scrape_hashtag=args.hashtag,
scrape_query=args.query,
scrape_latest=args.latest,
scrape_top=args.top,
)
scraper.save_to_csv() scraper.save_to_csv()
scraper.driver.close() scraper.driver.close()
else: else:

View File

@@ -39,7 +39,45 @@ class Twitter_Scraper:
print("Initializing Twitter Scraper...") print("Initializing Twitter Scraper...")
self.username = username self.username = username
self.password = password self.password = password
self.tweet_ids = set()
self.data = [] self.data = []
self.tweet_cards = []
self.scraper_details = {
"type": None,
"username": None,
"hashtag": None,
"query": None,
"tab": None,
}
self.max_tweets = max_tweets
self.progress = Progress(0, max_tweets)
self.router = self.go_to_home
self.driver = self._get_driver()
self.scroller = Scroller(self.driver)
self._login()
self._config_scraper(
max_tweets,
scrape_username,
scrape_hashtag,
scrape_query,
scrape_latest,
scrape_top,
)
def _config_scraper(
self,
max_tweets=50,
scrape_username=None,
scrape_hashtag=None,
scrape_query=None,
scrape_latest=True,
scrape_top=False,
):
self.tweet_ids = set()
self.data = []
self.tweet_cards = []
self.max_tweets = max_tweets
self.progress = Progress(0, max_tweets)
self.scraper_details = { self.scraper_details = {
"type": None, "type": None,
"username": scrape_username, "username": scrape_username,
@@ -50,13 +88,6 @@ class Twitter_Scraper:
"tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest", "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
} }
self.router = self.go_to_home self.router = self.go_to_home
self.tweet_ids = set()
self.max_tweets = max_tweets
self.progress = Progress(0, max_tweets)
self.tweet_cards = []
self.driver = self._get_driver()
self.scroller = Scroller(self.driver)
self._login()
if scrape_username is not None: if scrape_username is not None:
self.scraper_details["type"] = "Username" self.scraper_details["type"] = "Username"
@@ -70,6 +101,7 @@ class Twitter_Scraper:
else: else:
self.scraper_details["type"] = "Home" self.scraper_details["type"] = "Home"
self.router = self.go_to_home self.router = self.go_to_home
pass
def _get_driver(self): def _get_driver(self):
print("Setup WebDriver...") print("Setup WebDriver...")
@@ -238,26 +270,44 @@ It may be due to the following:
pass pass
def go_to_profile(self): def go_to_profile(self):
self.driver.get(f"https://twitter.com/{self.scraper_details['username']}") if (
sleep(3) self.scraper_details["username"] is None
or self.scraper_details["username"] == ""
):
print("Username is not set.")
sys.exit(1)
else:
self.driver.get(f"https://twitter.com/{self.scraper_details['username']}")
sleep(3)
pass pass
def go_to_hashtag(self): def go_to_hashtag(self):
url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click" if (
if self.scraper_details["tab"] == "Latest": self.scraper_details["hashtag"] is None
url += "&f=live" or self.scraper_details["hashtag"] == ""
):
print("Hashtag is not set.")
sys.exit(1)
else:
url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click"
if self.scraper_details["tab"] == "Latest":
url += "&f=live"
self.driver.get(url) self.driver.get(url)
sleep(3) sleep(3)
pass pass
def go_to_search(self): def go_to_search(self):
url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query" if self.scraper_details["query"] is None or self.scraper_details["query"] == "":
if self.scraper_details["tab"] == "Latest": print("Query is not set.")
url += "&f=live" sys.exit(1)
else:
url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
if self.scraper_details["tab"] == "Latest":
url += "&f=live"
self.driver.get(url) self.driver.get(url)
sleep(3) sleep(3)
pass pass
def get_tweet_cards(self): def get_tweet_cards(self):
@@ -266,7 +316,25 @@ It may be due to the following:
) )
pass pass
def scrape_tweets(self, router=None): def scrape_tweets(
self,
max_tweets=50,
scrape_username=None,
scrape_hashtag=None,
scrape_query=None,
scrape_latest=True,
scrape_top=False,
router=None,
):
self._config_scraper(
max_tweets,
scrape_username,
scrape_hashtag,
scrape_query,
scrape_latest,
scrape_top,
)
if router is None: if router is None:
router = self.router router = self.router