Merge pull request #9 from MagiPrince/master

2024-04-09 07:19:29 +08:00
parent ac2f8cc600 91539fcdd7
commit cb07c91b62
4 changed files with 115 additions and 22 deletions
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ pip install -r requirements.txt

 ```bash
 TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
+TWITTER_MAIL=# Your Twitter Email
 TWITTER_PASSWORD=# Your Twitter Password
 ```

@@ -131,6 +132,9 @@ options:                description
                          and query-based scraping.
                          usage:
                            python scraper -t 500 -ht=python --top
+
+-ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape
+                          (will scrape until no more tweets are available).
 ```

 ### Sample Scraping Commands
--- a/scraper/main.py
+++ b/scraper/main.py
@@ -24,6 +24,13 @@ def main():
        )

        try:
+            parser.add_argument(
+                "--mail",
+                type=str,
+                default=os.getenv("TWITTER_MAIL"),
+                help="Your Twitter mail.",
+            )
+
            parser.add_argument(
                "--user",
                type=str,
@@ -65,6 +72,14 @@ def main():
            help="Twitter hashtag. Scrape tweets from a hashtag.",
        )

+        parser.add_argument(
+            "-ntl",
+            "--no_tweets_limit",
+            nargs='?',
+            default=False,
+            help="Set no limit to the number of tweets to scrape (will scrap until no more tweets are available).",
+        )
+
        parser.add_argument(
            "-q",
            "--query",
@@ -95,6 +110,7 @@ def main():

        args = parser.parse_args()

+        USER_MAIL = args.mail
        USER_UNAME = args.user
        USER_PASSWORD = args.password

@@ -127,12 +143,14 @@ def main():

        if USER_UNAME is not None and USER_PASSWORD is not None:
            scraper = Twitter_Scraper(
+                mail=USER_MAIL,
                username=USER_UNAME,
                password=USER_PASSWORD,
            )
            scraper.login()
            scraper.scrape_tweets(
                max_tweets=args.tweets,
+                no_tweets_limit= args.no_tweets_limit if args.no_tweets_limit is not None else True,
                scrape_username=args.username,
                scrape_hashtag=args.hashtag,
                scrape_query=args.query,
--- a/scraper/progress.py
+++ b/scraper/progress.py
@@ -7,7 +7,7 @@ class Progress:
        self.total = total
        pass

-    def print_progress(self, current) -> None:
+    def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None:
        self.current = current
        progress = current / self.total
        bar_length = 40
@@ -17,8 +17,29 @@ class Progress:
            + "-" * (bar_length - int(bar_length * progress))
            + "]"
        )
+        if no_tweets_limit:
+            if waiting:
                sys.stdout.write(
-            "\rProgress: [{:<40}] {:.2%} {} of {}".format(
+                    "\rTweets scrapped : {} - waiting to access older tweets {} min on 15 min".format(
+                        current, retry_cnt
+                    )
+                )
+            else:
+                sys.stdout.write(
+                    "\rTweets scrapped : {}                                                  ".format(
+                        current
+                    )
+                )
+        else:
+            if waiting:
+                sys.stdout.write(
+                    "\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format(
+                        progress_bar, progress, current, self.total, retry_cnt
+                    )
+                )
+            else:
+                sys.stdout.write(
+                    "\rProgress: [{:<40}] {:.2%} {} of {}                                                  ".format(
                        progress_bar, progress, current, self.total
                    )
                )
--- a/scraper/twitter_scraper.py
+++ b/scraper/twitter_scraper.py
@@ -20,7 +20,13 @@ from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium.webdriver.chrome.service import Service as ChromeService

+from selenium.webdriver.firefox.options import Options as FirefoxOptions
+from selenium.webdriver.firefox.service import Service as FirefoxService
+
+from selenium.webdriver.support.ui import WebDriverWait
+
 from webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.firefox import GeckoDriverManager

 TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"

@@ -28,6 +34,7 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
 class Twitter_Scraper:
    def __init__(
        self,
+        mail,
        username,
        password,
        max_tweets=50,
@@ -39,6 +46,7 @@ class Twitter_Scraper:
        scrape_top=False,
    ):
        print("Initializing Twitter Scraper...")
+        self.mail = mail
        self.username = username
        self.password = password
        self.interrupted = False
@@ -115,7 +123,8 @@ class Twitter_Scraper:
        print("Setup WebDriver...")
        header = Headers().generate()["User-Agent"]

-        browser_option = ChromeOptions()
+        # browser_option = ChromeOptions()
+        browser_option = FirefoxOptions()
        browser_option.add_argument("--no-sandbox")
        browser_option.add_argument("--disable-dev-shm-usage")
        browser_option.add_argument("--ignore-certificate-errors")
@@ -129,8 +138,13 @@ class Twitter_Scraper:
        browser_option.add_argument("--headless")

        try:
-            print("Initializing ChromeDriver...")
-            driver = webdriver.Chrome(
+            # print("Initializing ChromeDriver...")
+            # driver = webdriver.Chrome(
+            #     options=browser_option,
+            # )
+
+            print("Initializing FirefoxDriver...")
+            driver = webdriver.Firefox(
                options=browser_option,
            )

@@ -138,13 +152,23 @@ class Twitter_Scraper:
            return driver
        except WebDriverException:
            try:
-                print("Downloading ChromeDriver...")
-                chromedriver_path = ChromeDriverManager().install()
-                chrome_service = ChromeService(executable_path=chromedriver_path)
+                # print("Downloading ChromeDriver...")
+                # chromedriver_path = ChromeDriverManager().install()
+                # chrome_service = ChromeService(executable_path=chromedriver_path)

-                print("Initializing ChromeDriver...")
-                driver = webdriver.Chrome(
-                    service=chrome_service,
+                print("Downloading FirefoxDriver...")
+                firefoxdriver_path = GeckoDriverManager().install()
+                firefox_service = FirefoxService(executable_path=firefoxdriver_path)
+
+                # print("Initializing ChromeDriver...")
+                # driver = webdriver.Chrome(
+                #     service=chrome_service,
+                #     options=browser_option,
+                # )
+
+                print("Initializing FirefoxDriver...")
+                driver = webdriver.Firefox(
+                    service=firefox_service,
                    options=browser_option,
                )

@@ -206,7 +230,7 @@ class Twitter_Scraper:
                    "xpath", "//input[@autocomplete='username']"
                )

-                username.send_keys(self.username)
+                username.send_keys(self.mail)
                username.send_keys(Keys.RETURN)
                sleep(3)
                break
@@ -345,6 +369,7 @@ It may be due to the following:
    def scrape_tweets(
        self,
        max_tweets=50,
+        no_tweets_limit=False,
        scrape_username=None,
        scrape_hashtag=None,
        scrape_query=None,
@@ -387,11 +412,20 @@ It may be due to the following:
        elif self.scraper_details["type"] == "Home":
            print("Scraping Tweets from Home...")

-        self.progress.print_progress(0)
+        # Refuse non-essential cookies to make the banner disappear
+        try:
+            accept_cookies_btn = self.driver.find_element(
+            "xpath", "//span[text()='Refuse non-essential cookies']/../../..")
+            accept_cookies_btn.click()
+        except NoSuchElementException:
+            pass
+
+        self.progress.print_progress(0, False, 0, no_tweets_limit)

        refresh_count = 0
        added_tweets = 0
        empty_count = 0
+        retry_cnt = 0

        while self.scroller.scrolling:
            try:
@@ -424,9 +458,9 @@ It may be due to the following:
                                    if not tweet.is_ad:
                                        self.data.append(tweet.tweet)
                                        added_tweets += 1
-                                        self.progress.print_progress(len(self.data))
+                                        self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)

-                                        if len(self.data) >= self.max_tweets:
+                                        if len(self.data) >= self.max_tweets and not no_tweets_limit:
                                            self.scroller.scrolling = False
                                            break
                                    else:
@@ -440,10 +474,25 @@ It may be due to the following:
                    except NoSuchElementException:
                        continue

-                if len(self.data) >= self.max_tweets:
+                if len(self.data) >= self.max_tweets and not no_tweets_limit:
                    break

                if added_tweets == 0:
+                    # While a "Retry" button is present, click it roughly once a minute (58 s + 2 s sleeps) until retry_cnt reaches 15
+                    try:
+                        while retry_cnt < 15:
+                            retry_button = self.driver.find_element(
+                            "xpath", "//span[text()='Retry']/../../..")
+                            self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
+                            sleep(58)
+                            retry_button.click()
+                            retry_cnt += 1
+                            sleep(2)
+                    # There is no Retry button, so the counter is reset
+                    except NoSuchElementException:
+                        retry_cnt = 0
+                        self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
+
                    if empty_count >= 5:
                        if refresh_count >= 3:
                            print()
@@ -470,11 +519,12 @@ It may be due to the following:

        print("")

-        if len(self.data) >= self.max_tweets:
+        if len(self.data) >= self.max_tweets or no_tweets_limit:
            print("Scraping Complete")
        else:
            print("Scraping Incomplete")

+        if not no_tweets_limit:
            print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))

        pass