From c896baa165073cb276b509e83a199b7cce053d35 Mon Sep 17 00:00:00 2001 From: Jarrian Date: Wed, 20 Sep 2023 09:27:22 +0800 Subject: [PATCH] update scraper configurator --- main.ipynb | 158 ++++++++++++++++++++++++++++--------- scraper/__main__.py | 9 ++- scraper/twitter_scraper.py | 108 ++++++++++++++++++++----- 3 files changed, 216 insertions(+), 59 deletions(-) diff --git a/main.ipynb b/main.ipynb index c1cde67..b3fc24d 100644 --- a/main.ipynb +++ b/main.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -264,13 +264,12 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n", "\n", - "\n", "class Twitter_Scraper:\n", " def __init__(\n", " self,\n", @@ -286,7 +285,45 @@ " print(\"Initializing Twitter Scraper...\")\n", " self.username = username\n", " self.password = password\n", + " self.tweet_ids = set()\n", " self.data = []\n", + " self.tweet_cards = []\n", + " self.scraper_details = {\n", + " \"type\": None,\n", + " \"username\": None,\n", + " \"hashtag\": None,\n", + " \"query\": None,\n", + " \"tab\": None,\n", + " }\n", + " self.max_tweets = max_tweets\n", + " self.progress = Progress(0, max_tweets)\n", + " self.router = self.go_to_home\n", + " self.driver = self._get_driver()\n", + " self.scroller = Scroller(self.driver)\n", + " self._login()\n", + " self._config_scraper(\n", + " max_tweets,\n", + " scrape_username,\n", + " scrape_hashtag,\n", + " scrape_query,\n", + " scrape_latest,\n", + " scrape_top,\n", + " )\n", + "\n", + " def _config_scraper(\n", + " self,\n", + " max_tweets=50,\n", + " scrape_username=None,\n", + " scrape_hashtag=None,\n", + " scrape_query=None,\n", + " scrape_latest=True,\n", + " scrape_top=False,\n", + " ):\n", + " self.tweet_ids = set()\n", + " self.data = []\n", + " self.tweet_cards = []\n", + " self.max_tweets = max_tweets\n", + " self.progress = Progress(0, max_tweets)\n", " self.scraper_details = {\n", " \"type\": None,\n", " \"username\": scrape_username,\n", @@ -297,13 +334,6 @@ " \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n", " }\n", " self.router = self.go_to_home\n", - " self.tweet_ids = set()\n", - " self.max_tweets = max_tweets\n", - " self.progress = Progress(0, max_tweets)\n", - " self.tweet_cards = []\n", - " self.driver = self._get_driver()\n", - " self.scroller = Scroller(self.driver)\n", - " self._login()\n", "\n", " if scrape_username is not None:\n", " self.scraper_details[\"type\"] = \"Username\"\n", @@ -317,6 +347,7 @@ " else:\n", " self.scraper_details[\"type\"] = \"Home\"\n", " self.router = self.go_to_home\n", + " pass\n", "\n", " def _get_driver(self):\n", " print(\"Setup WebDriver...\")\n", @@ -485,26 +516,44 @@ " pass\n", "\n", " def go_to_profile(self):\n", - " self.driver.get(f\"https://twitter.com/{self.scraper_details['username']}\")\n", - " sleep(3)\n", + " if (\n", + " self.scraper_details[\"username\"] is None\n", + " or self.scraper_details[\"username\"] == \"\"\n", + " ):\n", + " print(\"Username is not set.\")\n", + " sys.exit(1)\n", + " else:\n", + " self.driver.get(f\"https://twitter.com/{self.scraper_details['username']}\")\n", + " sleep(3)\n", " pass\n", "\n", " def go_to_hashtag(self):\n", - " url = f\"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click\"\n", - " if self.scraper_details[\"tab\"] == \"Latest\":\n", - " url += \"&f=live\"\n", + " if (\n", + " self.scraper_details[\"hashtag\"] is None\n", + " or self.scraper_details[\"hashtag\"] == \"\"\n", + " ):\n", + " print(\"Hashtag is not set.\")\n", + " sys.exit(1)\n", + " else:\n", + " url = f\"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click\"\n", + " if self.scraper_details[\"tab\"] == \"Latest\":\n", + " url += \"&f=live\"\n", "\n", - " self.driver.get(url)\n", - " sleep(3)\n", + " self.driver.get(url)\n", + " sleep(3)\n", " pass\n", "\n", " def go_to_search(self):\n", - " url = f\"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query\"\n", - " if self.scraper_details[\"tab\"] == \"Latest\":\n", - " url += \"&f=live\"\n", + " if self.scraper_details[\"query\"] is None or self.scraper_details[\"query\"] == \"\":\n", + " print(\"Query is not set.\")\n", + " sys.exit(1)\n", + " else:\n", + " url = f\"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query\"\n", + " if self.scraper_details[\"tab\"] == \"Latest\":\n", + " url += \"&f=live\"\n", "\n", - " self.driver.get(url)\n", - " sleep(3)\n", + " self.driver.get(url)\n", + " sleep(3)\n", " pass\n", "\n", " def get_tweet_cards(self):\n", @@ -513,7 +562,25 @@ " )\n", " pass\n", "\n", - " def scrape_tweets(self, router=None):\n", + " def scrape_tweets(\n", + " self,\n", + " max_tweets=50,\n", + " scrape_username=None,\n", + " scrape_hashtag=None,\n", + " scrape_query=None,\n", + " scrape_latest=True,\n", + " scrape_top=False,\n", + " router=None,\n", + " ):\n", + " self._config_scraper(\n", + " max_tweets,\n", + " scrape_username,\n", + " scrape_hashtag,\n", + " scrape_query,\n", + " scrape_latest,\n", + " scrape_top,\n", + " )\n", + "\n", " if router is None:\n", " router = self.router\n", "\n", @@ -655,7 +722,7 @@ " pass\n", "\n", " def get_tweets(self):\n", - " return self.data\n" + " return self.data" ] }, { @@ -668,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -692,8 +759,8 @@ "scraper = Twitter_Scraper(\n", " username=USER_UNAME,\n", " password=USER_PASSWORD,\n", - " max_tweets=10,\n", - " # scrape_username=\"something\",\n", + " # max_tweets=10,\n", + " # scrape_username=\"\",\n", " # scrape_hashtag=\"something\",\n", " # scrape_query=\"something\",\n", " # scrape_latest=True,\n", @@ -711,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -719,15 +786,22 @@ "output_type": "stream", "text": [ "Scraping Tweets from Home...\n", - "Progress: [[========================================]] 100.00% 10 of 10\n", + "Progress: [[========================================]] 100.00% 50 of 50\n", "Scraping Complete\n", - "Tweets: 10 out of 10\n", + "Tweets: 50 out of 50\n", "\n" ] } ], "source": [ - "scraper.scrape_tweets()" + "scraper.scrape_tweets(\n", + " # max_tweets=10,\n", + " # scrape_username=\"something\",\n", + " # scrape_hashtag=\"something\",\n", + " # scrape_query=\"something\",\n", + " # scrape_latest=True,\n", + " # scrape_top=False,\n", + ")" ] }, { @@ -740,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -748,12 +822,20 @@ "output_type": "stream", "text": [ "Saving Tweets to CSV...\n", - "CSV Saved: ./tweets/2023-09-13_17-14-51_tweets_1-10.csv\n" + "CSV Saved: ./tweets/2023-09-20_09-26-30_tweets_1-50.csv\n" ] } ], "source": [ - "scraper.save_to_csv()\n", + "scraper.save_to_csv()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "scraper.driver.close()" ] } @@ -774,7 +856,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.5" }, "orig_nbformat": 4 }, diff --git a/scraper/__main__.py b/scraper/__main__.py index 41d06ae..da10451 100644 --- a/scraper/__main__.py +++ b/scraper/__main__.py @@ -115,7 +115,14 @@ def main(): scrape_top=args.top, ) - scraper.scrape_tweets() + scraper.scrape_tweets( + max_tweets=args.tweets, + scrape_username=args.username, + scrape_hashtag=args.hashtag, + scrape_query=args.query, + scrape_latest=args.latest, + scrape_top=args.top, + ) scraper.save_to_csv() scraper.driver.close() else: diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index d6fcf29..c1fdc97 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -39,7 +39,45 @@ class Twitter_Scraper: print("Initializing Twitter Scraper...") self.username = username self.password = password + self.tweet_ids = set() self.data = [] + self.tweet_cards = [] + self.scraper_details = { + "type": None, + "username": None, + "hashtag": None, + "query": None, + "tab": None, + } + self.max_tweets = max_tweets + self.progress = Progress(0, max_tweets) + self.router = self.go_to_home + self.driver = self._get_driver() + self.scroller = Scroller(self.driver) + self._login() + self._config_scraper( + max_tweets, + scrape_username, + scrape_hashtag, + scrape_query, + scrape_latest, + scrape_top, + ) + + def _config_scraper( + self, + max_tweets=50, + scrape_username=None, + scrape_hashtag=None, + scrape_query=None, + scrape_latest=True, + scrape_top=False, + ): + self.tweet_ids = set() + self.data = [] + self.tweet_cards = [] + self.max_tweets = max_tweets + self.progress = Progress(0, max_tweets) self.scraper_details = { "type": None, "username": scrape_username, @@ -50,13 +88,6 @@ class Twitter_Scraper: "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest", } self.router = self.go_to_home - self.tweet_ids = set() - self.max_tweets = max_tweets - self.progress = Progress(0, max_tweets) - self.tweet_cards = [] - self.driver = self._get_driver() - self.scroller = Scroller(self.driver) - self._login() if scrape_username is not None: self.scraper_details["type"] = "Username" @@ -70,6 +101,7 @@ class Twitter_Scraper: else: self.scraper_details["type"] = "Home" self.router = self.go_to_home + pass def _get_driver(self): print("Setup WebDriver...") @@ -238,26 +270,44 @@ It may be due to the following: pass def go_to_profile(self): - self.driver.get(f"https://twitter.com/{self.scraper_details['username']}") - sleep(3) + if ( + self.scraper_details["username"] is None + or self.scraper_details["username"] == "" + ): + print("Username is not set.") + sys.exit(1) + else: + self.driver.get(f"https://twitter.com/{self.scraper_details['username']}") + sleep(3) pass def go_to_hashtag(self): - url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click" - if self.scraper_details["tab"] == "Latest": - url += "&f=live" + if ( + self.scraper_details["hashtag"] is None + or self.scraper_details["hashtag"] == "" + ): + print("Hashtag is not set.") + sys.exit(1) + else: + url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click" + if self.scraper_details["tab"] == "Latest": + url += "&f=live" - self.driver.get(url) - sleep(3) + self.driver.get(url) + sleep(3) pass def go_to_search(self): - url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query" - if self.scraper_details["tab"] == "Latest": - url += "&f=live" + if self.scraper_details["query"] is None or self.scraper_details["query"] == "": + print("Query is not set.") + sys.exit(1) + else: + url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query" + if self.scraper_details["tab"] == "Latest": + url += "&f=live" - self.driver.get(url) - sleep(3) + self.driver.get(url) + sleep(3) pass def get_tweet_cards(self): @@ -266,7 +316,25 @@ It may be due to the following: ) pass - def scrape_tweets(self, router=None): + def scrape_tweets( + self, + max_tweets=50, + scrape_username=None, + scrape_hashtag=None, + scrape_query=None, + scrape_latest=True, + scrape_top=False, + router=None, + ): + self._config_scraper( + max_tweets, + scrape_username, + scrape_hashtag, + scrape_query, + scrape_latest, + scrape_top, + ) + if router is None: router = self.router