Merge pull request #9 from MagiPrince/master

2024-04-09 07:19:29 +08:00
parent ac2f8cc600 91539fcdd7
commit cb07c91b62
4 changed files with 115 additions and 22 deletions
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ pip install -r requirements.txt
 ```bash
 TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
 TWITTER_MAIL=# Your Twitter Email
 TWITTER_PASSWORD=# Your Twitter Password
 ```
@@ -131,6 +132,9 @@ options:                description
                          and query-based scraping.
                          usage:
                            python scraper -t 500 -ht=python --top
 -ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape
                          (will scrape until no more tweets are available).
 ```
 ### Sample Scraping Commands
--- a/scraper/main.py
+++ b/scraper/main.py
@@ -24,6 +24,13 @@ def main():
        )
        try:
            parser.add_argument(
                "--mail",
                type=str,
                default=os.getenv("TWITTER_MAIL"),
                help="Your Twitter mail.",
            )
            parser.add_argument(
                "--user",
                type=str,
@@ -65,6 +72,14 @@ def main():
            help="Twitter hashtag. Scrape tweets from a hashtag.",
        )
        parser.add_argument(
            "-ntl",
            "--no_tweets_limit",
            nargs='?',
            default=False,
            help="Set no limit to the number of tweets to scrape (will scrap until no more tweets are available).",
        )
        parser.add_argument(
            "-q",
            "--query",
@@ -95,6 +110,7 @@ def main():
        args = parser.parse_args()
        USER_MAIL = args.mail
        USER_UNAME = args.user
        USER_PASSWORD = args.password
@@ -127,12 +143,14 @@ def main():
        if USER_UNAME is not None and USER_PASSWORD is not None:
            scraper = Twitter_Scraper(
                mail=USER_MAIL,
                username=USER_UNAME,
                password=USER_PASSWORD,
            )
            scraper.login()
            scraper.scrape_tweets(
                max_tweets=args.tweets,
                no_tweets_limit= args.no_tweets_limit if args.no_tweets_limit is not None else True,
                scrape_username=args.username,
                scrape_hashtag=args.hashtag,
                scrape_query=args.query,
--- a/scraper/progress.py
+++ b/scraper/progress.py
@@ -7,7 +7,7 @@ class Progress:
        self.total = total
        pass
-    def print_progress(self, current) -> None:
+    def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None:
        self.current = current
        progress = current / self.total
        bar_length = 40
@@ -17,6 +17,27 @@ class Progress:
            + "-" * (bar_length - int(bar_length * progress))
            + "]"
        )
        if no_tweets_limit:
            if waiting:
                sys.stdout.write(
                    "\rTweets scrapped : {} - waiting to access older tweets {} min on 15 min".format(
                        current, retry_cnt
                    )
                )
            else:
                sys.stdout.write(
                    "\rTweets scrapped : {}                                                  ".format(
                        current
                    )
                )
        else:
            if waiting:
                sys.stdout.write(
                    "\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format(
                        progress_bar, progress, current, self.total, retry_cnt
                    )
                )
            else:
                sys.stdout.write(
                    "\rProgress: [{:<40}] {:.2%} {} of {}                                                  ".format(
                        progress_bar, progress, current, self.total
--- a/scraper/twitter_scraper.py
+++ b/scraper/twitter_scraper.py
@@ -20,7 +20,13 @@ from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium.webdriver.chrome.service import Service as ChromeService
 from selenium.webdriver.firefox.options import Options as FirefoxOptions
 from selenium.webdriver.firefox.service import Service as FirefoxService
 from selenium.webdriver.support.ui import WebDriverWait
 from webdriver_manager.chrome import ChromeDriverManager
 from webdriver_manager.firefox import GeckoDriverManager
 TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
@@ -28,6 +34,7 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
 class Twitter_Scraper:
    def __init__(
        self,
        mail,
        username,
        password,
        max_tweets=50,
@@ -39,6 +46,7 @@ class Twitter_Scraper:
        scrape_top=False,
    ):
        print("Initializing Twitter Scraper...")
        self.mail = mail
        self.username = username
        self.password = password
        self.interrupted = False
@@ -115,7 +123,8 @@ class Twitter_Scraper:
        print("Setup WebDriver...")
        header = Headers().generate()["User-Agent"]
-        browser_option = ChromeOptions()
+        # browser_option = ChromeOptions()
        browser_option = FirefoxOptions()
        browser_option.add_argument("--no-sandbox")
        browser_option.add_argument("--disable-dev-shm-usage")
        browser_option.add_argument("--ignore-certificate-errors")
@@ -129,8 +138,13 @@ class Twitter_Scraper:
        browser_option.add_argument("--headless")
        try:
-            print("Initializing ChromeDriver...")
+            # print("Initializing ChromeDriver...")
-            driver = webdriver.Chrome(
+            # driver = webdriver.Chrome(
            #     options=browser_option,
            # )
            print("Initializing FirefoxDriver...")
            driver = webdriver.Firefox(
                options=browser_option,
            )
@@ -138,13 +152,23 @@ class Twitter_Scraper:
            return driver
        except WebDriverException:
            try:
-                print("Downloading ChromeDriver...")
+                # print("Downloading ChromeDriver...")
-                chromedriver_path = ChromeDriverManager().install()
+                # chromedriver_path = ChromeDriverManager().install()
-                chrome_service = ChromeService(executable_path=chromedriver_path)
+                # chrome_service = ChromeService(executable_path=chromedriver_path)
-                print("Initializing ChromeDriver...")
+                print("Downloading FirefoxDriver...")
-                driver = webdriver.Chrome(
+                firefoxdriver_path = GeckoDriverManager().install()
-                    service=chrome_service,
+                firefox_service = FirefoxService(executable_path=firefoxdriver_path)
                # print("Initializing ChromeDriver...")
                # driver = webdriver.Chrome(
                #     service=chrome_service,
                #     options=browser_option,
                # )
                print("Initializing FirefoxDriver...")
                driver = webdriver.Firefox(
                    service=firefox_service,
                    options=browser_option,
                )
@@ -206,7 +230,7 @@ class Twitter_Scraper:
                    "xpath", "//input[@autocomplete='username']"
                )
-                username.send_keys(self.username)
+                username.send_keys(self.mail)
                username.send_keys(Keys.RETURN)
                sleep(3)
                break
@@ -345,6 +369,7 @@ It may be due to the following:
    def scrape_tweets(
        self,
        max_tweets=50,
        no_tweets_limit=False,
        scrape_username=None,
        scrape_hashtag=None,
        scrape_query=None,
@@ -387,11 +412,20 @@ It may be due to the following:
        elif self.scraper_details["type"] == "Home":
            print("Scraping Tweets from Home...")
-        self.progress.print_progress(0)
+        # Dismiss the cookie banner by clicking "Refuse non-essential cookies"
        try:
            accept_cookies_btn = self.driver.find_element(
            "xpath", "//span[text()='Refuse non-essential cookies']/../../..")
            accept_cookies_btn.click()
        except NoSuchElementException:
            pass
        self.progress.print_progress(0, False, 0, no_tweets_limit)
        refresh_count = 0
        added_tweets = 0
        empty_count = 0
        retry_cnt = 0
        while self.scroller.scrolling:
            try:
@@ -424,9 +458,9 @@ It may be due to the following:
                                    if not tweet.is_ad:
                                        self.data.append(tweet.tweet)
                                        added_tweets += 1
-                                        self.progress.print_progress(len(self.data))
+                                        self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
-                                        if len(self.data) >= self.max_tweets:
+                                        if len(self.data) >= self.max_tweets and not no_tweets_limit:
                                            self.scroller.scrolling = False
                                            break
                                    else:
@@ -440,10 +474,25 @@ It may be due to the following:
                    except NoSuchElementException:
                        continue
-                if len(self.data) >= self.max_tweets:
+                if len(self.data) >= self.max_tweets and not no_tweets_limit:
                    break
                if added_tweets == 0:
                    # If a "Retry" button is present, click it about once a minute, for up to 15 tries
                    try:
                        while retry_cnt < 15:
                            retry_button = self.driver.find_element(
                            "xpath", "//span[text()='Retry']/../../..")
                            self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
                            sleep(58)
                            retry_button.click()
                            retry_cnt += 1
                            sleep(2)
                    # No Retry button was found, so the counter is reset
                    except NoSuchElementException:
                        retry_cnt = 0
                        self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
                    if empty_count >= 5:
                        if refresh_count >= 3:
                            print()
@@ -470,11 +519,12 @@ It may be due to the following:
        print("")
-        if len(self.data) >= self.max_tweets:
+        if len(self.data) >= self.max_tweets or no_tweets_limit:
            print("Scraping Complete")
        else:
            print("Scraping Incomplete")
        if not no_tweets_limit:
            print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
        pass