From da77993c12d937bf7341ccda671a7deddabe326b Mon Sep 17 00:00:00 2001 From: Jarrian Date: Sun, 24 Sep 2023 23:54:12 +0800 Subject: [PATCH] feat: Jupyter scrape followers and following count --- main.ipynb | 375 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 306 insertions(+), 69 deletions(-) diff --git a/main.ipynb b/main.ipynb index 9a60194..9a3e5d0 100644 --- a/main.ipynb +++ b/main.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -36,7 +36,9 @@ " StaleElementReferenceException,\n", " WebDriverException,\n", ")\n", + "from selenium.webdriver.common.action_chains import ActionChains\n", "\n", + "from selenium.webdriver.chrome.webdriver import WebDriver\n", "from selenium.webdriver.chrome.options import Options as ChromeOptions\n", "from selenium.webdriver.chrome.service import Service as ChromeService\n", "\n", @@ -55,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -95,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -139,27 +141,37 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "class Tweet:\n", - " def __init__(self, card: Chrome) -> None:\n", + " def __init__(\n", + " self,\n", + " card: WebDriver,\n", + " driver: WebDriver,\n", + " actions: ActionChains,\n", + " scrape_poster_details=False\n", + " ) -> None:\n", " self.card = card\n", + " self.error = False\n", + " self.tweet = None\n", "\n", " try:\n", " self.user = card.find_element(\n", " \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n", " ).text\n", " except NoSuchElementException:\n", - " return\n", + " self.error = True\n", + " self.user = \"skip\"\n", "\n", " try:\n", " self.handle = card.find_element(\n", " \"xpath\", './/span[contains(text(), \"@\")]'\n", " ).text\n", " except NoSuchElementException:\n", - " return\n", + " self.error = True\n", + " self.handle = \"skip\"\n", "\n", " try:\n", " self.date_time = card.find_element(\"xpath\", \".//time\").get_attribute(\n", @@ -170,6 +182,10 @@ " self.is_ad = False\n", " except NoSuchElementException:\n", " self.is_ad = True\n", + " self.error = True\n", + " self.date_time = \"skip\"\n", + " \n", + " if self.error:\n", " return\n", "\n", " try:\n", @@ -267,7 +283,78 @@ " ).get_attribute(\"src\")\n", " except NoSuchElementException:\n", " self.profile_img = \"\"\n", - "\n", + " \n", + " self.following_cnt = \"0\"\n", + " self.followers_cnt = \"0\"\n", + " \n", + " if scrape_poster_details:\n", + " el_name = card.find_element(\n", + " \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n", + " )\n", + " \n", + " ext_hover_card = False\n", + " ext_following = False\n", + " ext_followers = False\n", + " hover_attempt = 0\n", + " \n", + " while not ext_hover_card or not ext_following or not ext_followers:\n", + " try:\n", + " actions.move_to_element(el_name).perform()\n", + " \n", + " hover_card = driver.find_element(\n", + " \"xpath\",\n", + " '//div[@data-testid=\"hoverCardParent\"]'\n", + " )\n", + " \n", + " ext_hover_card = True\n", + " \n", + " while not ext_following:\n", + " try:\n", + " self.following_cnt = hover_card.find_element(\n", + " \"xpath\",\n", + " './/a[contains(@href, \"/following\")]//span'\n", + " ).text\n", + " \n", + " if self.following_cnt == \"\":\n", + " self.following_cnt = \"0\"\n", + " \n", + " ext_following = True\n", + " except NoSuchElementException:\n", + " continue\n", + " except StaleElementReferenceException:\n", + " self.error = True\n", + " return\n", + " \n", + " while not ext_followers:\n", + " try:\n", + " self.followers_cnt = hover_card.find_element(\n", + " \"xpath\",\n", + " './/a[contains(@href, \"/verified_followers\")]//span'\n", + " ).text\n", + " \n", + " if self.followers_cnt == \"\":\n", + " self.followers_cnt = \"0\"\n", + " \n", + " ext_followers = True\n", + " except NoSuchElementException:\n", + " continue\n", + " except StaleElementReferenceException:\n", + " self.error = True\n", + " return\n", + " except NoSuchElementException:\n", + " if hover_attempt==3:\n", + " self.error\n", + " return\n", + " hover_attempt+=1\n", + " sleep(0.5)\n", + " continue\n", + " except StaleElementReferenceException:\n", + " self.error = True\n", + " return\n", + " \n", + " if ext_hover_card and ext_following and ext_followers:\n", + " actions.reset_actions()\n", + " \n", " self.tweet = (\n", " self.user,\n", " self.handle,\n", @@ -282,6 +369,8 @@ " self.mentions,\n", " self.emojis,\n", " self.profile_img,\n", + " self.following_cnt,\n", + " self.followers_cnt,\n", " )\n", "\n", " pass\n" @@ -299,12 +388,13 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n", "\n", + "\n", "class Twitter_Scraper:\n", " def __init__(\n", " self,\n", @@ -314,6 +404,7 @@ " scrape_username=None,\n", " scrape_hashtag=None,\n", " scrape_query=None,\n", + " scrape_poster_details=False,\n", " scrape_latest=True,\n", " scrape_top=False,\n", " ):\n", @@ -329,13 +420,14 @@ " \"hashtag\": None,\n", " \"query\": None,\n", " \"tab\": None,\n", + " \"poster_details\": False,\n", " }\n", " self.max_tweets = max_tweets\n", " self.progress = Progress(0, max_tweets)\n", " self.router = self.go_to_home\n", " self.driver = self._get_driver()\n", + " self.actions = ActionChains(self.driver)\n", " self.scroller = Scroller(self.driver)\n", - " self._login()\n", " self._config_scraper(\n", " max_tweets,\n", " scrape_username,\n", @@ -343,6 +435,7 @@ " scrape_query,\n", " scrape_latest,\n", " scrape_top,\n", + " scrape_poster_details,\n", " )\n", "\n", " def _config_scraper(\n", @@ -353,6 +446,7 @@ " scrape_query=None,\n", " scrape_latest=True,\n", " scrape_top=False,\n", + " scrape_poster_details=False,\n", " ):\n", " self.tweet_ids = set()\n", " self.data = []\n", @@ -367,6 +461,7 @@ " else None,\n", " \"query\": scrape_query,\n", " \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n", + " \"poster_details\": scrape_poster_details,\n", " }\n", " self.router = self.go_to_home\n", " self.scroller = Scroller(self.driver)\n", @@ -408,6 +503,7 @@ " options=browser_option,\n", " )\n", "\n", + " print(\"WebDriver Setup Complete\")\n", " return driver\n", " except WebDriverException:\n", " try:\n", @@ -421,17 +517,20 @@ " options=browser_option,\n", " )\n", "\n", + " print(\"WebDriver Setup Complete\")\n", " return driver\n", " except Exception as e:\n", " print(f\"Error setting up WebDriver: {e}\")\n", " sys.exit(1)\n", + " pass\n", "\n", - " def _login(self):\n", + " def login(self):\n", + " print()\n", " print(\"Logging in to Twitter...\")\n", "\n", " try:\n", - " self.driver.get(TWITTER_LOGIN_URL)\n", " self.driver.maximize_window()\n", + " self.driver.get(TWITTER_LOGIN_URL)\n", " sleep(3)\n", "\n", " self._input_username()\n", @@ -594,10 +693,24 @@ "\n", " def get_tweet_cards(self):\n", " self.tweet_cards = self.driver.find_elements(\n", - " \"xpath\", '//article[@data-testid=\"tweet\"]'\n", + " \"xpath\", '//article[@data-testid=\"tweet\" and not(@disabled)]'\n", " )\n", " pass\n", "\n", + " def remove_hidden_cards(self):\n", + " try:\n", + " hidden_cards = self.driver.find_elements(\n", + " \"xpath\", '//article[@data-testid=\"tweet\" and @disabled]'\n", + " )\n", + "\n", + " for card in hidden_cards[1:-2]:\n", + " self.driver.execute_script(\n", + " \"arguments[0].parentNode.parentNode.parentNode.remove();\", card\n", + " )\n", + " except Exception as e:\n", + " return\n", + " pass\n", + "\n", " def scrape_tweets(\n", " self,\n", " max_tweets=50,\n", @@ -606,6 +719,7 @@ " scrape_query=None,\n", " scrape_latest=True,\n", " scrape_top=False,\n", + " scrape_poster_details=False,\n", " router=None,\n", " ):\n", " self._config_scraper(\n", @@ -615,6 +729,7 @@ " scrape_query,\n", " scrape_latest,\n", " scrape_top,\n", + " scrape_poster_details,\n", " )\n", "\n", " if router is None:\n", @@ -645,6 +760,8 @@ "\n", " refresh_count = 0\n", " added_tweets = 0\n", + " empty_count = 0\n", + " # stale_count = 0\n", "\n", " while self.scroller.scrolling:\n", " try:\n", @@ -652,62 +769,159 @@ " added_tweets = 0\n", "\n", " for card in self.tweet_cards[-15:]:\n", - " tweet = Tweet(card)\n", - "\n", " try:\n", - " tweet_id = f\"{tweet.user}{tweet.handle}{tweet.date_time}\"\n", - " except Exception as e:\n", + " tweet_id = str(card)\n", + "\n", + " # def hide_card(el):\n", + " # self.driver.execute_script(\n", + " # \"arguments[0].setAttribute('disabled', true);\", el\n", + " # )\n", + "\n", + " # self.driver.execute_script(\n", + " # \"arguments[0].parentElement.parentElement.parentElement.style.display='none';\", el\n", + " # )\n", + "\n", + " if tweet_id not in self.tweet_ids:\n", + " self.tweet_ids.add(tweet_id)\n", + "\n", + " if not self.scraper_details[\"poster_details\"]:\n", + " self.driver.execute_script(\n", + " \"arguments[0].scrollIntoView();\", card\n", + " )\n", + "\n", + " tweet = Tweet(\n", + " card=card,\n", + " driver=self.driver,\n", + " actions=self.actions,\n", + " scrape_poster_details=self.scraper_details[\n", + " \"poster_details\"\n", + " ],\n", + " )\n", + "\n", + " if tweet:\n", + " # try:\n", + " # tweet_sig = f\"{tweet.user}|{tweet.handle}|{tweet.date_time}|{tweet.is_ad}\"\n", + " # except Exception as e:\n", + " # continue\n", + "\n", + " # if tweet_sig not in self.tweet_ids:\n", + " # self.tweet_ids.add(tweet_sig)\n", + "\n", + " if not tweet.error and tweet.tweet is not None:\n", + " if not tweet.is_ad:\n", + " self.data.append(tweet.tweet)\n", + " added_tweets += 1\n", + " self.progress.print_progress(len(self.data))\n", + "\n", + " if len(self.data) >= self.max_tweets:\n", + " self.scroller.scrolling = False\n", + " break\n", + " else:\n", + " continue\n", + " else:\n", + " continue\n", + " # else:\n", + " # continue\n", + " else:\n", + " continue\n", + " else:\n", + " continue\n", + " # hide_card(card)\n", + " except NoSuchElementException:\n", " continue\n", "\n", - " if tweet_id not in self.tweet_ids:\n", - " self.tweet_ids.add(tweet_id)\n", - " if tweet:\n", - " if not tweet.is_ad:\n", - " self.data.append(tweet.tweet)\n", - " added_tweets += 1\n", - " self.progress.print_progress(len(self.data))\n", - "\n", - " if len(self.data) >= self.max_tweets:\n", - " self.scroller.scrolling = False\n", - " break\n", - "\n", - " if len(self.data) % 50 == 0:\n", - " sleep(2)\n", - "\n", " if len(self.data) >= self.max_tweets:\n", " break\n", "\n", + " # self.remove_hidden_cards()\n", + "\n", + " # if added_tweets == 0:\n", + " # refresh_count += 1\n", + "\n", + " # if len(self.tweet_cards) > 0:\n", + " # self.driver.execute_script(\n", + " # \"arguments[0].scrollIntoView();\", self.tweet_cards[-1]\n", + " # )\n", + " # sleep(2)\n", + "\n", + " # sleep(1)\n", + "\n", + " # if refresh_count >= 10:\n", + " # print()\n", + " # print(\"No more tweets to scrape\")\n", + " # break\n", + " # else:\n", + " # refresh_count = 0\n", + "\n", + " # if len(self.tweet_cards) == 0:\n", + " # empty_count += 1\n", + " # sleep(1)\n", + "\n", + " # if empty_count >= 3:\n", + " # router()\n", + " # sleep(2)\n", + " # break\n", + " # else:\n", + " # empty_count = 0\n", + "\n", + " # if added_tweets == 0:\n", + " # refresh_count += 1\n", + " # sleep(1)\n", + " # if refresh_count >= 10:\n", + " # print()\n", + " # print(\"No more tweets to scrape\")\n", + " # break\n", + " # else:\n", + " # refresh_count = 0\n", + "\n", " if added_tweets == 0:\n", - " refresh_count += 1\n", - " if refresh_count >= 10:\n", - " print()\n", - " print(\"No more tweets to scrape\")\n", - " break\n", + " if empty_count >= 5:\n", + " if refresh_count >= 3:\n", + " print()\n", + " print(\"No more tweets to scrape\")\n", + " break\n", + " # router()\n", + " # sleep(2)\n", + " refresh_count += 1\n", + " empty_count += 1\n", + " sleep(1)\n", " else:\n", + " empty_count = 0\n", " refresh_count = 0\n", "\n", - " self.scroller.scroll_count = 0\n", + " # self.scroller.scroll_count = 0\n", "\n", - " while True:\n", - " self.scroller.scroll_to_bottom()\n", - " sleep(2)\n", - " self.scroller.update_scroll_position()\n", + " # while True:\n", + " # self.scroller.scroll_to_bottom()\n", + " # sleep(2)\n", + " # self.scroller.update_scroll_position()\n", "\n", - " if self.scroller.last_position == self.scroller.current_position:\n", - " self.scroller.scroll_count += 1\n", + " # if self.scroller.last_position == self.scroller.current_position:\n", + " # # self.scroller.scroll_count += 1\n", "\n", - " if self.scroller.scroll_count >= 3:\n", - " router()\n", - " sleep(2)\n", - " break\n", - " else:\n", - " sleep(1)\n", - " else:\n", - " self.scroller.last_position = self.scroller.current_position\n", - " break\n", + " # # if self.scroller.scroll_count >= 3:\n", + " # # router()\n", + " # # sleep(2)\n", + " # # break\n", + " # # else:\n", + " # # sleep(1)\n", + " # sleep(2)\n", + " # else:\n", + " # self.scroller.last_position = self.scroller.current_position\n", + " # break\n", " except StaleElementReferenceException:\n", - " router()\n", + " # stale_count += 1\n", + "\n", + " # if stale_count >= 3:\n", + " # router()\n", + " # stale_count = 0\n", + "\n", " sleep(2)\n", + " continue\n", + " except KeyboardInterrupt:\n", + " print(\"\\n\")\n", + " print(\"Keyboard Interrupt\")\n", + " break\n", " except Exception as e:\n", " print(\"\\n\")\n", " print(f\"Error scraping tweets: {e}\")\n", @@ -749,6 +963,10 @@ " \"Profile Image\": [tweet[12] for tweet in self.data],\n", " }\n", "\n", + " if self.scraper_details[\"poster_details\"]:\n", + " data[\"Following\"] = [tweet[13] for tweet in self.data]\n", + " data[\"Followers\"] = [tweet[14] for tweet in self.data]\n", + "\n", " df = pd.DataFrame(data)\n", "\n", " current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n", @@ -773,7 +991,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 101, "metadata": {}, "outputs": [ { @@ -783,10 +1001,7 @@ "Initializing Twitter Scraper...\n", "Setup WebDriver...\n", "Initializing ChromeDriver...\n", - "Logging in to Twitter...\n", - "\n", - "Login Successful\n", - "\n" + "WebDriver Setup Complete\n" ] } ], @@ -801,11 +1016,33 @@ " # scrape_username=\"something\",\n", " # scrape_hashtag=\"something\",\n", " # scrape_query=\"something\",\n", - " # scrape_latest=True,\n", - " # scrape_top=False,\n", + " # scrape_latest=False,\n", + " # scrape_top=True,\n", + " # scrape_poster_details=True\n", ")" ] }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Logging in to Twitter...\n", + "\n", + "Login Successful\n", + "\n" + ] + } + ], + "source": [ + "scraper.login()" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -816,7 +1053,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 103, "metadata": {}, "outputs": [ { @@ -833,13 +1070,13 @@ ], "source": [ "scraper.scrape_tweets(\n", - " # max_tweets=10,\n", + " # max_tweets=100,\n", " # scrape_username=\"something\",\n", " # scrape_hashtag=\"something\",\n", - " # scrape_hashtag=\"something\",\n", " # scrape_query=\"something\",\n", - " # scrape_latest=True,\n", - " # scrape_top=False,\n", + " # scrape_latest=False,\n", + " # scrape_top=True,\n", + " # scrape_poster_details=True,\n", ")" ] }, @@ -853,7 +1090,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 104, "metadata": {}, "outputs": [ { @@ -861,7 +1098,7 @@ "output_type": "stream", "text": [ "Saving Tweets to CSV...\n", - "CSV Saved: ./tweets/2023-09-23_09-54-41_tweets_1-50.csv\n" + "CSV Saved: ./tweets/2023-09-24_23-47-18_tweets_1-50.csv\n" ] } ], @@ -871,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [