From 069b0cc24a89e8a70c1e5524ee85a156a5cadf5d Mon Sep 17 00:00:00 2001 From: Jarrian Date: Mon, 25 Sep 2023 08:27:08 +0800 Subject: [PATCH] feat: optionally scrape followers and following --- main.ipynb | 25 +++---- scraper/__main__.py | 26 ++++--- scraper/tweet.py | 100 ++++++++++++++++++++++++-- scraper/twitter_scraper.py | 142 ++++++++++++++++++++++++------------- 4 files changed, 217 insertions(+), 76 deletions(-) diff --git a/main.ipynb b/main.ipynb index 3d864ce..4895850 100644 --- a/main.ipynb +++ b/main.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -97,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -141,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -387,13 +387,12 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n", "\n", - "\n", "class Twitter_Scraper:\n", " def __init__(\n", " self,\n", @@ -410,6 +409,7 @@ " print(\"Initializing Twitter Scraper...\")\n", " self.username = username\n", " self.password = password\n", + " self.interrupted = False\n", " self.tweet_ids = set()\n", " self.data = []\n", " self.tweet_cards = []\n", @@ -829,6 +829,7 @@ " except KeyboardInterrupt:\n", " print(\"\\n\")\n", " print(\"Keyboard Interrupt\")\n", + " self.interrupted = True\n", " break\n", " except Exception as e:\n", " print(\"\\n\")\n", @@ -899,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -932,7 +933,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -961,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -998,7 +999,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -1006,7 +1007,7 @@ "output_type": "stream", "text": [ "Saving Tweets to CSV...\n", - "CSV Saved: ./tweets/2023-09-24_23-57-11_tweets_1-50.csv\n" + "CSV Saved: ./tweets/2023-09-25_08-20-51_tweets_1-50.csv\n" ] } ], @@ -1016,7 +1017,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ diff --git a/scraper/__main__.py b/scraper/__main__.py index c54e322..429ecf4 100644 --- a/scraper/__main__.py +++ b/scraper/__main__.py @@ -73,6 +73,14 @@ def main(): help="Twitter query or search. 
Scrape tweets from a query or search.", ) + parser.add_argument( + "-a", + "--add", + type=str, + default="", + help="Additional data to scrape and save in the .csv file.", + ) + parser.add_argument( "--latest", action="store_true", @@ -107,6 +115,8 @@ def main(): if args.query is not None: tweet_type_args.append(args.query) + additional_data: list = args.add.split(",") + if len(tweet_type_args) > 1: print("Please specify only one of --username, --hashtag, or --query.") sys.exit(1) @@ -119,14 +129,8 @@ def main(): scraper = Twitter_Scraper( username=USER_UNAME, password=USER_PASSWORD, - max_tweets=args.tweets, - scrape_username=args.username, - scrape_hashtag=args.hashtag, - scrape_query=args.query, - scrape_latest=args.latest, - scrape_top=args.top, ) - + scraper.login() scraper.scrape_tweets( max_tweets=args.tweets, scrape_username=args.username, @@ -134,9 +138,11 @@ def main(): scrape_query=args.query, scrape_latest=args.latest, scrape_top=args.top, + scrape_poster_details="pd" in additional_data, ) scraper.save_to_csv() - scraper.driver.close() + if not scraper.interrupted: + scraper.driver.close() else: print( "Missing Twitter username or password environment variables. Please check your .env file." @@ -145,6 +151,10 @@ def main(): except KeyboardInterrupt: print("\nScript Interrupted by user. Exiting...") sys.exit(1) + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + sys.exit(1) if __name__ == "__main__": diff --git a/scraper/tweet.py b/scraper/tweet.py index a1e8e08..760c966 100644 --- a/scraper/tweet.py +++ b/scraper/tweet.py @@ -1,24 +1,39 @@ -from selenium.webdriver import Chrome -from selenium.common.exceptions import NoSuchElementException +from time import sleep +from selenium.common.exceptions import ( + NoSuchElementException, + StaleElementReferenceException, +) +from selenium.webdriver.chrome.webdriver import WebDriver +from selenium.webdriver.common.action_chains import ActionChains class Tweet: - def __init__(self, card: Chrome) -> None: + def __init__( + self, + card: WebDriver, + driver: WebDriver, + actions: ActionChains, + scrape_poster_details=False, + ) -> None: self.card = card + self.error = False + self.tweet = None try: self.user = card.find_element( "xpath", './/div[@data-testid="User-Name"]//span' ).text except NoSuchElementException: - return + self.error = True + self.user = "skip" try: self.handle = card.find_element( "xpath", './/span[contains(text(), "@")]' ).text except NoSuchElementException: - return + self.error = True + self.handle = "skip" try: self.date_time = card.find_element("xpath", ".//time").get_attribute( @@ -29,6 +44,10 @@ class Tweet: self.is_ad = False except NoSuchElementException: self.is_ad = True + self.error = True + self.date_time = "skip" + + if self.error: return try: @@ -129,6 +148,75 @@ class Tweet: except NoSuchElementException: self.profile_img = "" + self.following_cnt = "0" + self.followers_cnt = "0" + + if scrape_poster_details: + el_name = card.find_element( + "xpath", './/div[@data-testid="User-Name"]//span' + ) + + ext_hover_card = False + ext_following = False + ext_followers = False + hover_attempt = 0 + + while not ext_hover_card or not ext_following or not ext_followers: + try: + actions.move_to_element(el_name).perform() + + hover_card = driver.find_element( + "xpath", '//div[@data-testid="hoverCardParent"]' + ) + + ext_hover_card = True + + while not ext_following: + try: + self.following_cnt = hover_card.find_element( + "xpath", './/a[contains(@href, "/following")]//span' + ).text + + if 
+                                self.following_cnt = "0"
+
+                            ext_following = True
+                        except NoSuchElementException:
+                            continue
+                        except StaleElementReferenceException:
+                            self.error = True
+                            return
+
+                    while not ext_followers:
+                        try:
+                            self.followers_cnt = hover_card.find_element(
+                                "xpath",
+                                './/a[contains(@href, "/verified_followers")]//span',
+                            ).text
+
+                            if self.followers_cnt == "":
+                                self.followers_cnt = "0"
+
+                            ext_followers = True
+                        except NoSuchElementException:
+                            continue
+                        except StaleElementReferenceException:
+                            self.error = True
+                            return
+                except NoSuchElementException:
+                    if hover_attempt == 3:
+                        self.error = True  # give up on poster details after three failed hovers
+                        return
+                    hover_attempt += 1
+                    sleep(0.5)
+                    continue
+                except StaleElementReferenceException:
+                    self.error = True
+                    return
+
+            if ext_hover_card and ext_following and ext_followers:
+                actions.reset_actions()
+
         self.tweet = (
             self.user,
             self.handle,
@@ -143,6 +231,8 @@ class Tweet:
             self.mentions,
             self.emojis,
             self.profile_img,
+            self.following_cnt,
+            self.followers_cnt,
         )
 
         pass
diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py
index da14b38..b478a05 100644
--- a/scraper/twitter_scraper.py
+++ b/scraper/twitter_scraper.py
@@ -8,6 +8,7 @@ from tweet import Tweet
 from datetime import datetime
 from fake_headers import Headers
 from time import sleep
+
 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
 from selenium.common.exceptions import (
@@ -15,7 +16,7 @@
     StaleElementReferenceException,
     WebDriverException,
 )
-
+from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium.webdriver.chrome.service import Service as ChromeService
 
@@ -33,12 +34,14 @@ class Twitter_Scraper:
         scrape_username=None,
         scrape_hashtag=None,
         scrape_query=None,
+        scrape_poster_details=False,
         scrape_latest=True,
         scrape_top=False,
     ):
         print("Initializing Twitter Scraper...")
         self.username = username
         self.password = password
+        self.interrupted = False
         self.tweet_ids = set()
         self.data = []
         self.tweet_cards = []
@@ -48,13 +51,14 @@
             "hashtag": None,
             "query": None,
             "tab": None,
+            "poster_details": False,
         }
         self.max_tweets = max_tweets
         self.progress = Progress(0, max_tweets)
         self.router = self.go_to_home
         self.driver = self._get_driver()
+        self.actions = ActionChains(self.driver)
         self.scroller = Scroller(self.driver)
-        self._login()
         self._config_scraper(
             max_tweets,
             scrape_username,
@@ -62,6 +66,7 @@
             scrape_query,
             scrape_latest,
             scrape_top,
+            scrape_poster_details,
         )
 
     def _config_scraper(
@@ -72,6 +77,7 @@
         scrape_query=None,
         scrape_latest=True,
         scrape_top=False,
+        scrape_poster_details=False,
     ):
         self.tweet_ids = set()
         self.data = []
@@ -86,6 +92,7 @@
             else None,
             "query": scrape_query,
             "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
+            "poster_details": scrape_poster_details,
         }
         self.router = self.go_to_home
         self.scroller = Scroller(self.driver)
@@ -127,6 +134,7 @@
                 options=browser_option,
             )
 
+            print("WebDriver Setup Complete")
             return driver
         except WebDriverException:
             try:
@@ -140,17 +148,20 @@
                     options=browser_option,
                 )
 
+                print("WebDriver Setup Complete")
                 return driver
             except Exception as e:
                 print(f"Error setting up WebDriver: {e}")
                 sys.exit(1)
+
         pass
 
-    def _login(self):
+    def login(self):
+        print()
        print("Logging in to Twitter...")
 
         try:
-            self.driver.get(TWITTER_LOGIN_URL)
self.driver.maximize_window() + self.driver.get(TWITTER_LOGIN_URL) sleep(3) self._input_username() @@ -313,10 +324,24 @@ It may be due to the following: def get_tweet_cards(self): self.tweet_cards = self.driver.find_elements( - "xpath", '//article[@data-testid="tweet"]' + "xpath", '//article[@data-testid="tweet" and not(@disabled)]' ) pass + def remove_hidden_cards(self): + try: + hidden_cards = self.driver.find_elements( + "xpath", '//article[@data-testid="tweet" and @disabled]' + ) + + for card in hidden_cards[1:-2]: + self.driver.execute_script( + "arguments[0].parentNode.parentNode.parentNode.remove();", card + ) + except Exception as e: + return + pass + def scrape_tweets( self, max_tweets=50, @@ -325,6 +350,7 @@ It may be due to the following: scrape_query=None, scrape_latest=True, scrape_top=False, + scrape_poster_details=False, router=None, ): self._config_scraper( @@ -334,6 +360,7 @@ It may be due to the following: scrape_query, scrape_latest, scrape_top, + scrape_poster_details, ) if router is None: @@ -364,6 +391,7 @@ It may be due to the following: refresh_count = 0 added_tweets = 0 + empty_count = 0 while self.scroller.scrolling: try: @@ -371,62 +399,70 @@ It may be due to the following: added_tweets = 0 for card in self.tweet_cards[-15:]: - tweet = Tweet(card) - try: - tweet_id = f"{tweet.user}{tweet.handle}{tweet.date_time}" - except Exception as e: + tweet_id = str(card) + + if tweet_id not in self.tweet_ids: + self.tweet_ids.add(tweet_id) + + if not self.scraper_details["poster_details"]: + self.driver.execute_script( + "arguments[0].scrollIntoView();", card + ) + + tweet = Tweet( + card=card, + driver=self.driver, + actions=self.actions, + scrape_poster_details=self.scraper_details[ + "poster_details" + ], + ) + + if tweet: + if not tweet.error and tweet.tweet is not None: + if not tweet.is_ad: + self.data.append(tweet.tweet) + added_tweets += 1 + self.progress.print_progress(len(self.data)) + + if len(self.data) >= self.max_tweets: + self.scroller.scrolling = False + break + else: + continue + else: + continue + else: + continue + else: + continue + except NoSuchElementException: continue - if tweet_id not in self.tweet_ids: - self.tweet_ids.add(tweet_id) - if tweet: - if not tweet.is_ad: - self.data.append(tweet.tweet) - added_tweets += 1 - self.progress.print_progress(len(self.data)) - - if len(self.data) >= self.max_tweets: - self.scroller.scrolling = False - break - - if len(self.data) % 50 == 0: - sleep(2) - if len(self.data) >= self.max_tweets: break if added_tweets == 0: - refresh_count += 1 - if refresh_count >= 10: - print() - print("No more tweets to scrape") - break - else: - refresh_count = 0 - - self.scroller.scroll_count = 0 - - while True: - self.scroller.scroll_to_bottom() - sleep(2) - self.scroller.update_scroll_position() - - if self.scroller.last_position == self.scroller.current_position: - self.scroller.scroll_count += 1 - - if self.scroller.scroll_count >= 3: - router() - sleep(2) + if empty_count >= 5: + if refresh_count >= 3: + print() + print("No more tweets to scrape") break - else: - sleep(1) - else: - self.scroller.last_position = self.scroller.current_position - break + refresh_count += 1 + empty_count += 1 + sleep(1) + else: + empty_count = 0 + refresh_count = 0 except StaleElementReferenceException: - router() sleep(2) + continue + except KeyboardInterrupt: + print("\n") + print("Keyboard Interrupt") + self.interrupted = True + break except Exception as e: print("\n") print(f"Error scraping tweets: {e}") @@ -468,6 +504,10 @@ It may be due 
to the following: "Profile Image": [tweet[12] for tweet in self.data], } + if self.scraper_details["poster_details"]: + data["Following"] = [tweet[13] for tweet in self.data] + data["Followers"] = [tweet[14] for tweet in self.data] + df = pd.DataFrame(data) current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
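
A minimal usage sketch of the new option, mirroring the call sequence in
scraper/__main__.py above (USER_UNAME and USER_PASSWORD come from the .env
file as in the rest of the repo; the query and tweet count are placeholder
values):

    from twitter_scraper import Twitter_Scraper

    scraper = Twitter_Scraper(username=USER_UNAME, password=USER_PASSWORD)
    scraper.login()  # login is now an explicit, public step
    scraper.scrape_tweets(
        max_tweets=50,
        scrape_query="#python",
        scrape_latest=True,
        scrape_poster_details=True,  # what `--add=pd` toggles on the CLI
    )
    scraper.save_to_csv()  # the CSV gains "Following" and "Followers" columns
    if not scraper.interrupted:
        scraper.driver.close()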