update scraper configurator
This commit is contained in:
158
main.ipynb
158
main.ipynb
@@ -17,7 +17,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -55,7 +55,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -95,7 +95,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -139,7 +139,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -264,13 +264,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Twitter_Scraper:\n",
|
||||
" def __init__(\n",
|
||||
" self,\n",
|
||||
@@ -286,7 +285,45 @@
|
||||
" print(\"Initializing Twitter Scraper...\")\n",
|
||||
" self.username = username\n",
|
||||
" self.password = password\n",
|
||||
" self.tweet_ids = set()\n",
|
||||
" self.data = []\n",
|
||||
" self.tweet_cards = []\n",
|
||||
" self.scraper_details = {\n",
|
||||
" \"type\": None,\n",
|
||||
" \"username\": None,\n",
|
||||
" \"hashtag\": None,\n",
|
||||
" \"query\": None,\n",
|
||||
" \"tab\": None,\n",
|
||||
" }\n",
|
||||
" self.max_tweets = max_tweets\n",
|
||||
" self.progress = Progress(0, max_tweets)\n",
|
||||
" self.router = self.go_to_home\n",
|
||||
" self.driver = self._get_driver()\n",
|
||||
" self.scroller = Scroller(self.driver)\n",
|
||||
" self._login()\n",
|
||||
" self._config_scraper(\n",
|
||||
" max_tweets,\n",
|
||||
" scrape_username,\n",
|
||||
" scrape_hashtag,\n",
|
||||
" scrape_query,\n",
|
||||
" scrape_latest,\n",
|
||||
" scrape_top,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def _config_scraper(\n",
|
||||
" self,\n",
|
||||
" max_tweets=50,\n",
|
||||
" scrape_username=None,\n",
|
||||
" scrape_hashtag=None,\n",
|
||||
" scrape_query=None,\n",
|
||||
" scrape_latest=True,\n",
|
||||
" scrape_top=False,\n",
|
||||
" ):\n",
|
||||
" self.tweet_ids = set()\n",
|
||||
" self.data = []\n",
|
||||
" self.tweet_cards = []\n",
|
||||
" self.max_tweets = max_tweets\n",
|
||||
" self.progress = Progress(0, max_tweets)\n",
|
||||
" self.scraper_details = {\n",
|
||||
" \"type\": None,\n",
|
||||
" \"username\": scrape_username,\n",
|
||||
@@ -297,13 +334,6 @@
|
||||
" \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n",
|
||||
" }\n",
|
||||
" self.router = self.go_to_home\n",
|
||||
" self.tweet_ids = set()\n",
|
||||
" self.max_tweets = max_tweets\n",
|
||||
" self.progress = Progress(0, max_tweets)\n",
|
||||
" self.tweet_cards = []\n",
|
||||
" self.driver = self._get_driver()\n",
|
||||
" self.scroller = Scroller(self.driver)\n",
|
||||
" self._login()\n",
|
||||
"\n",
|
||||
" if scrape_username is not None:\n",
|
||||
" self.scraper_details[\"type\"] = \"Username\"\n",
|
||||
@@ -317,6 +347,7 @@
|
||||
" else:\n",
|
||||
" self.scraper_details[\"type\"] = \"Home\"\n",
|
||||
" self.router = self.go_to_home\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def _get_driver(self):\n",
|
||||
" print(\"Setup WebDriver...\")\n",
|
||||
@@ -485,26 +516,44 @@
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def go_to_profile(self):\n",
|
||||
" self.driver.get(f\"https://twitter.com/{self.scraper_details['username']}\")\n",
|
||||
" sleep(3)\n",
|
||||
" if (\n",
|
||||
" self.scraper_details[\"username\"] is None\n",
|
||||
" or self.scraper_details[\"username\"] == \"\"\n",
|
||||
" ):\n",
|
||||
" print(\"Username is not set.\")\n",
|
||||
" sys.exit(1)\n",
|
||||
" else:\n",
|
||||
" self.driver.get(f\"https://twitter.com/{self.scraper_details['username']}\")\n",
|
||||
" sleep(3)\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def go_to_hashtag(self):\n",
|
||||
" url = f\"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click\"\n",
|
||||
" if self.scraper_details[\"tab\"] == \"Latest\":\n",
|
||||
" url += \"&f=live\"\n",
|
||||
" if (\n",
|
||||
" self.scraper_details[\"hashtag\"] is None\n",
|
||||
" or self.scraper_details[\"hashtag\"] == \"\"\n",
|
||||
" ):\n",
|
||||
" print(\"Hashtag is not set.\")\n",
|
||||
" sys.exit(1)\n",
|
||||
" else:\n",
|
||||
" url = f\"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click\"\n",
|
||||
" if self.scraper_details[\"tab\"] == \"Latest\":\n",
|
||||
" url += \"&f=live\"\n",
|
||||
"\n",
|
||||
" self.driver.get(url)\n",
|
||||
" sleep(3)\n",
|
||||
" self.driver.get(url)\n",
|
||||
" sleep(3)\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def go_to_search(self):\n",
|
||||
" url = f\"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query\"\n",
|
||||
" if self.scraper_details[\"tab\"] == \"Latest\":\n",
|
||||
" url += \"&f=live\"\n",
|
||||
" if self.scraper_details[\"query\"] is None or self.scraper_details[\"query\"] == \"\":\n",
|
||||
" print(\"Query is not set.\")\n",
|
||||
" sys.exit(1)\n",
|
||||
" else:\n",
|
||||
" url = f\"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query\"\n",
|
||||
" if self.scraper_details[\"tab\"] == \"Latest\":\n",
|
||||
" url += \"&f=live\"\n",
|
||||
"\n",
|
||||
" self.driver.get(url)\n",
|
||||
" sleep(3)\n",
|
||||
" self.driver.get(url)\n",
|
||||
" sleep(3)\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def get_tweet_cards(self):\n",
|
||||
@@ -513,7 +562,25 @@
|
||||
" )\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def scrape_tweets(self, router=None):\n",
|
||||
" def scrape_tweets(\n",
|
||||
" self,\n",
|
||||
" max_tweets=50,\n",
|
||||
" scrape_username=None,\n",
|
||||
" scrape_hashtag=None,\n",
|
||||
" scrape_query=None,\n",
|
||||
" scrape_latest=True,\n",
|
||||
" scrape_top=False,\n",
|
||||
" router=None,\n",
|
||||
" ):\n",
|
||||
" self._config_scraper(\n",
|
||||
" max_tweets,\n",
|
||||
" scrape_username,\n",
|
||||
" scrape_hashtag,\n",
|
||||
" scrape_query,\n",
|
||||
" scrape_latest,\n",
|
||||
" scrape_top,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if router is None:\n",
|
||||
" router = self.router\n",
|
||||
"\n",
|
||||
@@ -655,7 +722,7 @@
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def get_tweets(self):\n",
|
||||
" return self.data\n"
|
||||
" return self.data"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -668,7 +735,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -692,8 +759,8 @@
|
||||
"scraper = Twitter_Scraper(\n",
|
||||
" username=USER_UNAME,\n",
|
||||
" password=USER_PASSWORD,\n",
|
||||
" max_tweets=10,\n",
|
||||
" # scrape_username=\"something\",\n",
|
||||
" # max_tweets=10,\n",
|
||||
" # scrape_username=\"\",\n",
|
||||
" # scrape_hashtag=\"something\",\n",
|
||||
" # scrape_query=\"something\",\n",
|
||||
" # scrape_latest=True,\n",
|
||||
@@ -711,7 +778,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -719,15 +786,22 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Scraping Tweets from Home...\n",
|
||||
"Progress: [[========================================]] 100.00% 10 of 10\n",
|
||||
"Progress: [[========================================]] 100.00% 50 of 50\n",
|
||||
"Scraping Complete\n",
|
||||
"Tweets: 10 out of 10\n",
|
||||
"Tweets: 50 out of 50\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"scraper.scrape_tweets()"
|
||||
"scraper.scrape_tweets(\n",
|
||||
" # max_tweets=10,\n",
|
||||
" # scrape_username=\"something\",\n",
|
||||
" # scrape_hashtag=\"something\",\n",
|
||||
" # scrape_query=\"something\",\n",
|
||||
" # scrape_latest=True,\n",
|
||||
" # scrape_top=False,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -740,7 +814,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -748,12 +822,20 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Saving Tweets to CSV...\n",
|
||||
"CSV Saved: ./tweets/2023-09-13_17-14-51_tweets_1-10.csv\n"
|
||||
"CSV Saved: ./tweets/2023-09-20_09-26-30_tweets_1-50.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"scraper.save_to_csv()\n",
|
||||
"scraper.save_to_csv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scraper.driver.close()"
|
||||
]
|
||||
}
|
||||
@@ -774,7 +856,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
"version": "3.11.5"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user