From 0a1c607e55fa8c84172aaa9dab7f75500a8e3843 Mon Sep 17 00:00:00 2001
From: MagiPrince
Date: Fri, 22 Mar 2024 15:05:04 +0100
Subject: [PATCH 1/3] Use the Twitter username when asked during login, and
 allow scraping tweets with no limit (though the practical cap is the number
 of tweets the browser can handle)

---
 README.md                  |  4 ++
 scraper/__main__.py        | 18 ++++++++
 scraper/progress.py        | 33 ++++++++++++---
 scraper/twitter_scraper.py | 84 ++++++++++++++++++++++++++++++--------
 4 files changed, 117 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index cc597ce..8d6c068 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ pip install -r requirements.txt

 ```bash
 TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
+TWITTER_MAIL=# Your Twitter Email
 TWITTER_PASSWORD=# Your Twitter Password
 ```

@@ -131,6 +132,9 @@ options:
                         description and query-based scraping.

                         usage: python scraper -t 500 -ht=python --top
+
+-ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape
+                        (will scrape until no more tweets are available).
 ```

 ### Sample Scraping Commands

diff --git a/scraper/__main__.py b/scraper/__main__.py
index 429ecf4..7676189 100644
--- a/scraper/__main__.py
+++ b/scraper/__main__.py
@@ -24,6 +24,13 @@ def main():
     )

     try:
+        parser.add_argument(
+            "--mail",
+            type=str,
+            default=os.getenv("TWITTER_MAIL"),
+            help="Your Twitter email.",
+        )
+
         parser.add_argument(
             "--user",
             type=str,
@@ -65,6 +72,14 @@ def main():
             help="Twitter hashtag. Scrape tweets from a hashtag.",
         )

+        parser.add_argument(
+            "-ntl",
+            "--no_tweets_limit",
+            nargs="?",
+            default=False,
+            help="Set no limit to the number of tweets to scrape (will scrape until no more tweets are available).",
+        )
+
         parser.add_argument(
             "-q",
             "--query",
@@ -95,6 +110,7 @@ def main():

         args = parser.parse_args()

+        USER_MAIL = args.mail
         USER_UNAME = args.user
         USER_PASSWORD = args.password

@@ -127,12 +143,14 @@ def main():

     if USER_UNAME is not None and USER_PASSWORD is not None:
         scraper = Twitter_Scraper(
+            mail=USER_MAIL,
             username=USER_UNAME,
             password=USER_PASSWORD,
         )
         scraper.login()
         scraper.scrape_tweets(
             max_tweets=args.tweets,
+            no_tweets_limit=args.no_tweets_limit if args.no_tweets_limit is not None else True,
             scrape_username=args.username,
             scrape_hashtag=args.hashtag,
             scrape_query=args.query,

diff --git a/scraper/progress.py b/scraper/progress.py
index 9af9bd4..71e6783 100644
--- a/scraper/progress.py
+++ b/scraper/progress.py
@@ -7,7 +7,7 @@ class Progress:
         self.total = total
         pass

-    def print_progress(self, current) -> None:
+    def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None:
         self.current = current
         progress = current / self.total
         bar_length = 40
@@ -17,9 +17,30 @@ class Progress:
             + "-" * (bar_length - int(bar_length * progress))
             + "]"
         )
-        sys.stdout.write(
-            "\rProgress: [{:<40}] {:.2%} {} of {}".format(
-                progress_bar, progress, current, self.total
-            )
-        )
+        if no_tweets_limit:
+            if waiting:
+                sys.stdout.write(
+                    "\rTweets scraped: {} - waiting {} of 15 min to access older tweets".format(
+                        current, retry_cnt
+                    )
+                )
+            else:
+                sys.stdout.write(
+                    "\rTweets scraped: {} ".format(
+                        current
+                    )
+                )
+        else:
+            if waiting:
+                sys.stdout.write(
+                    "\rProgress: [{:<40}] {:.2%} {} of {} - waiting {} of 15 min to access older tweets".format(
+                        progress_bar, progress, current, self.total, retry_cnt
+                    )
+                )
+            else:
+                sys.stdout.write(
+                    "\rProgress: [{:<40}] {:.2%} {} of {} ".format(
+                        progress_bar, progress, current, self.total
+                    )
+                )
         sys.stdout.flush()

diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py
index dc2b63a..1a06749 100644
--- a/scraper/twitter_scraper.py
+++ b/scraper/twitter_scraper.py
@@ -20,7 +20,13 @@ from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium.webdriver.chrome.service import Service as ChromeService

+from selenium.webdriver.firefox.options import Options as FirefoxOptions
+from selenium.webdriver.firefox.service import Service as FirefoxService
+
+from selenium.webdriver.support.ui import WebDriverWait
+
 from webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.firefox import GeckoDriverManager

 TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"

@@ -28,6 +34,7 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
 class Twitter_Scraper:
     def __init__(
         self,
+        mail,
         username,
         password,
         max_tweets=50,
@@ -39,6 +46,7 @@ class Twitter_Scraper:
         scrape_top=False,
     ):
         print("Initializing Twitter Scraper...")
+        self.mail = mail
         self.username = username
         self.password = password
         self.interrupted = False
@@ -115,7 +123,8 @@ class Twitter_Scraper:
         print("Setup WebDriver...")
         header = Headers().generate()["User-Agent"]

-        browser_option = ChromeOptions()
+        # browser_option = ChromeOptions()
+        browser_option = FirefoxOptions()
         browser_option.add_argument("--no-sandbox")
         browser_option.add_argument("--disable-dev-shm-usage")
         browser_option.add_argument("--ignore-certificate-errors")
@@ -129,8 +138,13 @@ class Twitter_Scraper:
         browser_option.add_argument("--headless")

         try:
-            print("Initializing ChromeDriver...")
-            driver = webdriver.Chrome(
+            # print("Initializing ChromeDriver...")
+            # driver = webdriver.Chrome(
+            #     options=browser_option,
+            # )
+
+            print("Initializing FirefoxDriver...")
+            driver = webdriver.Firefox(
                 options=browser_option,
             )

@@ -138,13 +152,23 @@ class Twitter_Scraper:
             return driver
         except WebDriverException:
             try:
-                print("Downloading ChromeDriver...")
-                chromedriver_path = ChromeDriverManager().install()
-                chrome_service = ChromeService(executable_path=chromedriver_path)
+                # print("Downloading ChromeDriver...")
+                # chromedriver_path = ChromeDriverManager().install()
+                # chrome_service = ChromeService(executable_path=chromedriver_path)

-                print("Initializing ChromeDriver...")
-                driver = webdriver.Chrome(
-                    service=chrome_service,
+                print("Downloading FirefoxDriver...")
+                firefoxdriver_path = GeckoDriverManager().install()
+                firefox_service = FirefoxService(executable_path=firefoxdriver_path)
+
+                # print("Initializing ChromeDriver...")
+                # driver = webdriver.Chrome(
+                #     service=chrome_service,
+                #     options=browser_option,
+                # )
+
+                print("Initializing FirefoxDriver...")
+                driver = webdriver.Firefox(
+                    service=firefox_service,
                     options=browser_option,
                 )

@@ -206,7 +230,7 @@ class Twitter_Scraper:
                     "xpath", "//input[@autocomplete='username']"
                 )

-                username.send_keys(self.username)
+                username.send_keys(self.mail)
                 username.send_keys(Keys.RETURN)
                 sleep(3)
                 break
@@ -315,10 +339,12 @@ It may be due to the following:
             sys.exit(1)
         else:
             url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
+            print(url)

             if self.scraper_details["tab"] == "Latest":
                 url += "&f=live"

         self.driver.get(url)
+        self.driver.save_screenshot('screenshot5.png')
         sleep(3)
         pass

@@ -345,6 +371,7 @@ It may be due to the following:
     def scrape_tweets(
         self,
         max_tweets=50,
+        no_tweets_limit=False,
         scrape_username=None,
         scrape_hashtag=None,
         scrape_query=None,
@@ -387,11 +414,20 @@ It may be due to the following:
         elif self.scraper_details["type"] == "Home":
             print("Scraping Tweets from Home...")

-        self.progress.print_progress(0)
+        # Dismiss the cookie banner by refusing non-essential cookies
+        try:
+            refuse_cookies_btn = self.driver.find_element(
+                "xpath", "//span[text()='Refuse non-essential cookies']/../../..")
+            refuse_cookies_btn.click()
+        except NoSuchElementException:
+            pass
+
+        self.progress.print_progress(0, False, 0, no_tweets_limit)

         refresh_count = 0
         added_tweets = 0
         empty_count = 0
+        retry_cnt = 0

         while self.scroller.scrolling:
             try:
@@ -424,9 +460,9 @@ It may be due to the following:
                         if not tweet.is_ad:
                             self.data.append(tweet.tweet)
                             added_tweets += 1
-                            self.progress.print_progress(len(self.data))
+                            self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)

-                            if len(self.data) >= self.max_tweets:
+                            if len(self.data) >= self.max_tweets and not no_tweets_limit:
                                 self.scroller.scrolling = False
                                 break
                         else:
@@ -440,10 +476,25 @@ It may be due to the following:
                 except NoSuchElementException:
                     continue

-            if len(self.data) >= self.max_tweets:
+            if len(self.data) >= self.max_tweets and not no_tweets_limit:
                 break

             if added_tweets == 0:
+                # If a "Retry" button is present, click it roughly once a minute, up to 15 tries
+                try:
+                    while retry_cnt < 15:
+                        retry_button = self.driver.find_element(
+                            "xpath", "//span[text()='Retry']/../../..")
+                        self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
+                        sleep(58)
+                        retry_button.click()
+                        retry_cnt += 1
+                        sleep(2)
+                # No "Retry" button was found, so the counter is reset
+                except NoSuchElementException:
+                    retry_cnt = 0
+                    self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
+
                 if empty_count >= 5:
                     if refresh_count >= 3:
                         print()
@@ -470,12 +521,13 @@ It may be due to the following:

         print("")

-        if len(self.data) >= self.max_tweets:
+        if len(self.data) >= self.max_tweets or no_tweets_limit:
             print("Scraping Complete")
         else:
             print("Scraping Incomplete")

-        print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
+        if not no_tweets_limit:
+            print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))

         pass

From cf048244f322785071a79a4d40e8d55bb257d9c6 Mon Sep 17 00:00:00 2001
From: MagiPrince
Date: Fri, 22 Mar 2024 15:26:10 +0100
Subject: [PATCH 2/3] Remove the debug screenshot call

---
 scraper/twitter_scraper.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py
index 1a06749..131f715 100644
--- a/scraper/twitter_scraper.py
+++ b/scraper/twitter_scraper.py
@@ -344,7 +344,6 @@ It may be due to the following:
                 url += "&f=live"

         self.driver.get(url)
-        self.driver.save_screenshot('screenshot5.png')
         sleep(3)
         pass

From 91539fcdd727ca8a2cba5be92e8216a589da6654 Mon Sep 17 00:00:00 2001
From: MagiPrince
Date: Fri, 22 Mar 2024 15:29:57 +0100
Subject: [PATCH 3/3] Remove the debug print of the search URL

---
 scraper/twitter_scraper.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py
index 131f715..f994be5 100644
--- a/scraper/twitter_scraper.py
+++ b/scraper/twitter_scraper.py
@@ -339,7 +339,6 @@ It may be due to the following:
             sys.exit(1)
         else:
             url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
-            print(url)

             if self.scraper_details["tab"] == "Latest":
                 url += "&f=live"
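
Note for reviewers: below is a minimal usage sketch, not part of the patch series, showing how the new `mail` and `no_tweets_limit` parameters fit together. It relies only on the signatures introduced in PATCH 1/3; the credentials and hashtag are placeholders, and the comment on `username` reflects the commit message's "used when asked" behavior.

    # Minimal sketch with placeholder values -- when using the CLI instead,
    # these come from --mail/--user/--password or the TWITTER_MAIL,
    # TWITTER_USERNAME, and TWITTER_PASSWORD environment variables.
    from scraper.twitter_scraper import Twitter_Scraper

    scraper = Twitter_Scraper(
        mail="you@example.com",    # typed into the login form's first field
        username="@your_handle",   # used if Twitter asks for the handle afterwards
        password="your_password",
    )
    scraper.login()
    scraper.scrape_tweets(
        no_tweets_limit=True,      # keep scrolling until no more tweets load
        scrape_hashtag="python",   # equivalent to -ht=python on the CLI
    )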