diff --git a/README.md b/README.md index cc597ce..8d6c068 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ pip install -r requirements.txt ```bash TWITTER_USERNAME=# Your Twitter Handle (e.g. @username) +TWITTER_MAIL=# Your Twitter Mail TWITTER_PASSWORD=# Your Twitter Password ``` @@ -131,6 +132,9 @@ options: description and query-based scraping. usage: python scraper -t 500 -ht=python --top + +-ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape + (will scrape until no more tweets are available). ``` ### Sample Scraping Commands diff --git a/scraper/__main__.py b/scraper/__main__.py index 429ecf4..7676189 100644 --- a/scraper/__main__.py +++ b/scraper/__main__.py @@ -24,6 +24,13 @@ def main(): ) try: + parser.add_argument( + "--mail", + type=str, + default=os.getenv("TWITTER_MAIL"), + help="Your Twitter mail.", + ) + parser.add_argument( "--user", type=str, @@ -65,6 +72,14 @@ def main(): help="Twitter hashtag. Scrape tweets from a hashtag.", ) + parser.add_argument( + "-ntl", + "--no_tweets_limit", + nargs='?', + default=False, + help="Set no limit to the number of tweets to scrape (will scrape until no more tweets are available).", + ) + parser.add_argument( "-q", "--query", @@ -95,6 +110,7 @@ def main(): args = parser.parse_args() + USER_MAIL = args.mail USER_UNAME = args.user USER_PASSWORD = args.password @@ -127,12 +143,14 @@ def main(): if USER_UNAME is not None and USER_PASSWORD is not None: scraper = Twitter_Scraper( + mail=USER_MAIL, username=USER_UNAME, password=USER_PASSWORD, ) scraper.login() scraper.scrape_tweets( max_tweets=args.tweets, + no_tweets_limit=args.no_tweets_limit if args.no_tweets_limit is not None else True, scrape_username=args.username, scrape_hashtag=args.hashtag, scrape_query=args.query, diff --git a/scraper/progress.py b/scraper/progress.py index 9af9bd4..71e6783 100644 --- a/scraper/progress.py +++ b/scraper/progress.py @@ -7,7 +7,7 @@ class Progress: self.total = total pass - def 
print_progress(self, current) -> None: + def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None: self.current = current progress = current / self.total bar_length = 40 @@ -17,9 +17,30 @@ class Progress: + "-" * (bar_length - int(bar_length * progress)) + "]" ) - sys.stdout.write( - "\rProgress: [{:<40}] {:.2%} {} of {}".format( - progress_bar, progress, current, self.total - ) - ) + if no_tweets_limit: + if waiting: + sys.stdout.write( + "\rTweets scraped : {} - waiting to access older tweets {} min on 15 min".format( + current, retry_cnt + ) + ) + else: + sys.stdout.write( + "\rTweets scraped : {} ".format( + current + ) + ) + else: + if waiting: + sys.stdout.write( + "\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format( + progress_bar, progress, current, self.total, retry_cnt + ) + ) + else: + sys.stdout.write( + "\rProgress: [{:<40}] {:.2%} {} of {} ".format( + progress_bar, progress, current, self.total + ) + ) sys.stdout.flush() diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index dc2b63a..1a06749 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -20,7 +20,13 @@ from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.firefox.options import Options as FirefoxOptions +from selenium.webdriver.firefox.service import Service as FirefoxService + +from selenium.webdriver.support.ui import WebDriverWait + from webdriver_manager.chrome import ChromeDriverManager +from webdriver_manager.firefox import GeckoDriverManager TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login" @@ -28,6 +34,7 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login" class Twitter_Scraper: def __init__( self, + mail, username, password, max_tweets=50, @@ -39,6 +46,7 @@ class Twitter_Scraper: 
scrape_top=False, ): print("Initializing Twitter Scraper...") + self.mail = mail self.username = username self.password = password self.interrupted = False @@ -115,7 +123,8 @@ class Twitter_Scraper: print("Setup WebDriver...") header = Headers().generate()["User-Agent"] - browser_option = ChromeOptions() + # browser_option = ChromeOptions() + browser_option = FirefoxOptions() browser_option.add_argument("--no-sandbox") browser_option.add_argument("--disable-dev-shm-usage") browser_option.add_argument("--ignore-certificate-errors") @@ -129,8 +138,13 @@ class Twitter_Scraper: browser_option.add_argument("--headless") try: - print("Initializing ChromeDriver...") - driver = webdriver.Chrome( + # print("Initializing ChromeDriver...") + # driver = webdriver.Chrome( + # options=browser_option, + # ) + + print("Initializing FirefoxDriver...") + driver = webdriver.Firefox( options=browser_option, ) @@ -138,13 +152,23 @@ class Twitter_Scraper: return driver except WebDriverException: try: - print("Downloading ChromeDriver...") - chromedriver_path = ChromeDriverManager().install() - chrome_service = ChromeService(executable_path=chromedriver_path) + # print("Downloading ChromeDriver...") + # chromedriver_path = ChromeDriverManager().install() + # chrome_service = ChromeService(executable_path=chromedriver_path) - print("Initializing ChromeDriver...") - driver = webdriver.Chrome( - service=chrome_service, + print("Downloading FirefoxDriver...") + firefoxdriver_path = GeckoDriverManager().install() + firefox_service = FirefoxService(executable_path=firefoxdriver_path) + + # print("Initializing ChromeDriver...") + # driver = webdriver.Chrome( + # service=chrome_service, + # options=browser_option, + # ) + + print("Initializing FirefoxDriver...") + driver = webdriver.Firefox( + service=firefox_service, options=browser_option, ) @@ -206,7 +230,7 @@ class Twitter_Scraper: "xpath", "//input[@autocomplete='username']" ) - username.send_keys(self.username) + 
username.send_keys(self.mail) username.send_keys(Keys.RETURN) sleep(3) break @@ -315,10 +339,12 @@ It may be due to the following: sys.exit(1) else: url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query" + print(url) if self.scraper_details["tab"] == "Latest": url += "&f=live" self.driver.get(url) + self.driver.save_screenshot('screenshot5.png') sleep(3) pass @@ -345,6 +371,7 @@ It may be due to the following: def scrape_tweets( self, max_tweets=50, + no_tweets_limit=False, scrape_username=None, scrape_hashtag=None, scrape_query=None, @@ -387,11 +414,20 @@ It may be due to the following: elif self.scraper_details["type"] == "Home": print("Scraping Tweets from Home...") - self.progress.print_progress(0) + # Dismiss the cookie consent banner by refusing non-essential cookies + try: + accept_cookies_btn = self.driver.find_element( + "xpath", "//span[text()='Refuse non-essential cookies']/../../..") + accept_cookies_btn.click() + except NoSuchElementException: + pass + + self.progress.print_progress(0, False, 0, no_tweets_limit) refresh_count = 0 added_tweets = 0 empty_count = 0 + retry_cnt = 0 while self.scroller.scrolling: try: @@ -424,9 +460,9 @@ It may be due to the following: if not tweet.is_ad: self.data.append(tweet.tweet) added_tweets += 1 - self.progress.print_progress(len(self.data)) + self.progress.print_progress(len(self.data), False, 0, no_tweets_limit) - if len(self.data) >= self.max_tweets: + if len(self.data) >= self.max_tweets and not no_tweets_limit: self.scroller.scrolling = False break else: @@ -440,10 +476,25 @@ It may be due to the following: except NoSuchElementException: continue - if len(self.data) >= self.max_tweets: + if len(self.data) >= self.max_tweets and not no_tweets_limit: break if added_tweets == 0: + # If a "Retry" button is present, click it at regular intervals, up to a maximum number of tries + try: + while retry_cnt < 15: + retry_button = self.driver.find_element( + "xpath", "//span[text()='Retry']/../../..") + 
self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit) + sleep(58) + retry_button.click() + retry_cnt += 1 + sleep(2) + # There is no Retry button, so the counter is reset + except NoSuchElementException: + retry_cnt = 0 + self.progress.print_progress(len(self.data), False, 0, no_tweets_limit) + if empty_count >= 5: if refresh_count >= 3: print() @@ -470,12 +521,13 @@ It may be due to the following: print("") - if len(self.data) >= self.max_tweets: + if len(self.data) >= self.max_tweets or no_tweets_limit: print("Scraping Complete") else: print("Scraping Incomplete") - print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets)) + if not no_tweets_limit: + print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets)) pass