Add error handling; the script now keeps running until an unexpected error occurs or the maximum number of tweets is reached

This commit is contained in:
Jarrian
2023-09-09 14:19:13 +08:00
parent 8843b4d81e
commit b43fb72dbd
5 changed files with 142 additions and 127 deletions

View File

@@ -1,6 +1,7 @@
import os import os
import sys import sys
import argparse import argparse
import getpass
from twitter_scraper import Twitter_Scraper from twitter_scraper import Twitter_Scraper
try: try:
@@ -16,37 +17,49 @@ except Exception as e:
def main(): def main():
try: try:
USER_UNAME = os.getenv('TWITTER_USERNAME')
USER_PASSWORD = os.getenv('TWITTER_PASSWORD')
except Exception as e:
print(f"Error retrieving environment variables: {e}")
USER_UNAME = None
USER_PASSWORD = None
sys.exit(1)
parser = argparse.ArgumentParser(description='Twitter Scraper')
parser.add_argument('--tweets', type=int, default=50,
help='Number of tweets to scrape (default: 50)')
args = parser.parse_args()
if USER_UNAME is not None and USER_PASSWORD is not None:
try: try:
USER_UNAME = os.getenv("TWITTER_USERNAME")
USER_PASSWORD = os.getenv("TWITTER_PASSWORD")
except Exception as e:
print(f"Error retrieving environment variables: {e}")
USER_UNAME = None
USER_PASSWORD = None
sys.exit(1)
if USER_UNAME is None:
USER_UNAME = input("Twitter Username: ")
if USER_PASSWORD is None:
USER_PASSWORD = getpass.getpass("Enter Password: ")
print()
parser = argparse.ArgumentParser(description="Twitter Scraper")
parser.add_argument(
"--tweets",
type=int,
default=50,
help="Number of tweets to scrape (default: 50)",
)
args = parser.parse_args()
if USER_UNAME is not None and USER_PASSWORD is not None:
scraper = Twitter_Scraper( scraper = Twitter_Scraper(
username=USER_UNAME, username=USER_UNAME, password=USER_PASSWORD, max_tweets=args.tweets
password=USER_PASSWORD,
max_tweets=args.tweets
) )
scraper.scrape_tweets() scraper.scrape_tweets()
scraper.driver.close()
scraper.save_to_csv() scraper.save_to_csv()
except KeyboardInterrupt: scraper.driver.close()
print("\nScript Interrupted by user. Exiting...") else:
print(
"Missing Twitter username or password environment variables. Please check your .env file."
)
sys.exit(1) sys.exit(1)
else: except KeyboardInterrupt:
print("Missing Twitter username or password environment variables. Please check your .env file.") print("\nScript Interrupted by user. Exiting...")
sys.exit(1) sys.exit(1)
if __name__ == '__main__': if __name__ == "__main__":
main() main()

View File

@@ -11,8 +11,15 @@ class Progress:
self.current = current self.current = current
progress = current / self.total progress = current / self.total
bar_length = 40 bar_length = 40
progress_bar = "[" + "=" * int(bar_length * progress) + \ progress_bar = (
"-" * (bar_length - int(bar_length * progress)) + "]" "["
+ "=" * int(bar_length * progress)
+ "-" * (bar_length - int(bar_length * progress))
+ "]"
)
sys.stdout.write( sys.stdout.write(
"\rProgress: [{:<40}] {:.2%} {} of {}".format(progress_bar, progress, current, self.total)) "\rProgress: [{:<40}] {:.2%} {} of {}".format(
progress_bar, progress, current, self.total
)
)
sys.stdout.flush() sys.stdout.flush()

View File

@@ -1,27 +1,26 @@
class Scroller(): class Scroller:
def __init__(self, driver) -> None: def __init__(self, driver) -> None:
self.driver = driver self.driver = driver
self.current_position = 0 self.current_position = 0
self.last_position = driver.execute_script("return window.pageYOffset;") self.last_position = driver.execute_script("return window.pageYOffset;")
self.scrolling = True self.scrolling = True
self.scroll_count = 0 self.scroll_count = 0
pass pass
def reset(self) -> None: def reset(self) -> None:
self.current_position = 0 self.current_position = 0
self.last_position = self.driver.execute_script("return window.pageYOffset;") self.last_position = self.driver.execute_script("return window.pageYOffset;")
self.scroll_count = 0 self.scroll_count = 0
pass pass
def scroll_to_top(self) -> None: def scroll_to_top(self) -> None:
self.driver.execute_script("window.scrollTo(0, 0);") self.driver.execute_script("window.scrollTo(0, 0);")
pass pass
def scroll_to_bottom(self) -> None: def scroll_to_bottom(self) -> None:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
pass pass
def update_scroll_position(self) -> None: def update_scroll_position(self) -> None:
self.current_position = self.driver.execute_script("return window.pageYOffset;") self.current_position = self.driver.execute_script("return window.pageYOffset;")
pass pass

View File

@@ -7,23 +7,20 @@ class Tweet:
self.card = card self.card = card
self.user = card.find_element( self.user = card.find_element(
'xpath', "xpath", './/div[@data-testid="User-Name"]//span'
'.//div[@data-testid="User-Name"]//span'
).text ).text
try: try:
self.handle = card.find_element( self.handle = card.find_element(
'xpath', "xpath", './/span[contains(text(), "@")]'
'.//span[contains(text(), "@")]'
).text ).text
except NoSuchElementException: except NoSuchElementException:
return return
try: try:
self.date_time = card.find_element( self.date_time = card.find_element("xpath", ".//time").get_attribute(
'xpath', "datetime"
'.//time' )
).get_attribute('datetime')
if self.date_time is not None: if self.date_time is not None:
self.is_ad = False self.is_ad = False
@@ -33,8 +30,7 @@ class Tweet:
try: try:
card.find_element( card.find_element(
'xpath', "xpath", './/*[local-name()="svg" and @data-testid="icon-verified"]'
'.//*[local-name()="svg" and @data-testid="icon-verified"]'
) )
self.verified = True self.verified = True
@@ -43,8 +39,8 @@ class Tweet:
self.content = "" self.content = ""
contents = card.find_elements( contents = card.find_elements(
'xpath', "xpath",
'.//div[@data-testid="tweetText"]/span | .//div[@data-testid="tweetText"]/a' './/div[@data-testid="tweetText"]/span | .//div[@data-testid="tweetText"]/a',
) )
for index, content in enumerate(contents): for index, content in enumerate(contents):
@@ -52,43 +48,38 @@ class Tweet:
try: try:
self.reply_cnt = card.find_element( self.reply_cnt = card.find_element(
'xpath', "xpath", './/div[@data-testid="reply"]//span'
'.//div[@data-testid="reply"]//span'
).text ).text
except NoSuchElementException: except NoSuchElementException:
self.reply_cnt = '0' self.reply_cnt = "0"
try: try:
self.retweet_cnt = card.find_element( self.retweet_cnt = card.find_element(
'xpath', "xpath", './/div[@data-testid="retweet"]//span'
'.//div[@data-testid="retweet"]//span'
).text ).text
except NoSuchElementException: except NoSuchElementException:
self.retweet_cnt = '0' self.retweet_cnt = "0"
try: try:
self.like_cnt = card.find_element( self.like_cnt = card.find_element(
'xpath', "xpath", './/div[@data-testid="like"]//span'
'.//div[@data-testid="like"]//span'
).text ).text
except NoSuchElementException: except NoSuchElementException:
self.like_cnt = '0' self.like_cnt = "0"
try: try:
self.analytics_cnt = card.find_element( self.analytics_cnt = card.find_element(
'xpath', "xpath", './/a[contains(@href, "/analytics")]//span'
'.//a[contains(@href, "/analytics")]//span'
).text ).text
except NoSuchElementException: except NoSuchElementException:
self.analytics_cnt = '0' self.analytics_cnt = "0"
try: try:
self.profile_img = card.find_element( self.profile_img = card.find_element(
'xpath', "xpath", './/div[@data-testid="Tweet-User-Avatar"]//img'
'.//div[@data-testid="Tweet-User-Avatar"]//img' ).get_attribute("src")
).get_attribute('src')
except NoSuchElementException: except NoSuchElementException:
self.profile_img = '' self.profile_img = ""
self.tweet = ( self.tweet = (
self.user, self.user,
@@ -100,7 +91,7 @@ class Tweet:
self.retweet_cnt, self.retweet_cnt,
self.like_cnt, self.like_cnt,
self.analytics_cnt, self.analytics_cnt,
self.profile_img self.profile_img,
) )
pass pass

View File

@@ -10,7 +10,11 @@ from fake_headers import Headers
from time import sleep from time import sleep
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, WebDriverException from selenium.common.exceptions import (
NoSuchElementException,
StaleElementReferenceException,
WebDriverException,
)
from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService from selenium.webdriver.chrome.service import Service as ChromeService
@@ -20,7 +24,7 @@ from webdriver_manager.chrome import ChromeDriverManager
TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login" TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
class Twitter_Scraper(): class Twitter_Scraper:
def __init__(self, username, password, max_tweets=50): def __init__(self, username, password, max_tweets=50):
print("Initializing Twitter Scraper...") print("Initializing Twitter Scraper...")
self.username = username self.username = username
@@ -36,17 +40,17 @@ class Twitter_Scraper():
def _get_driver(self): def _get_driver(self):
print("Setup WebDriver...") print("Setup WebDriver...")
header = Headers().generate()['User-Agent'] header = Headers().generate()["User-Agent"]
browser_option = ChromeOptions() browser_option = ChromeOptions()
browser_option.add_argument('--no-sandbox') browser_option.add_argument("--no-sandbox")
browser_option.add_argument("--disable-dev-shm-usage") browser_option.add_argument("--disable-dev-shm-usage")
browser_option.add_argument('--ignore-certificate-errors') browser_option.add_argument("--ignore-certificate-errors")
browser_option.add_argument('--disable-gpu') browser_option.add_argument("--disable-gpu")
browser_option.add_argument('--log-level=3') browser_option.add_argument("--log-level=3")
browser_option.add_argument('--disable-notifications') browser_option.add_argument("--disable-notifications")
browser_option.add_argument('--disable-popup-blocking') browser_option.add_argument("--disable-popup-blocking")
browser_option.add_argument('--user-agent={}'.format(header)) browser_option.add_argument("--user-agent={}".format(header))
# For Hiding Browser # For Hiding Browser
browser_option.add_argument("--headless") browser_option.add_argument("--headless")
@@ -62,8 +66,7 @@ class Twitter_Scraper():
try: try:
print("Downloading ChromeDriver...") print("Downloading ChromeDriver...")
chromedriver_path = ChromeDriverManager().install() chromedriver_path = ChromeDriverManager().install()
chrome_service = ChromeService( chrome_service = ChromeService(executable_path=chromedriver_path)
executable_path=chromedriver_path)
print("Initializing ChromeDriver...") print("Initializing ChromeDriver...")
driver = webdriver.Chrome( driver = webdriver.Chrome(
@@ -96,8 +99,7 @@ class Twitter_Scraper():
while True: while True:
try: try:
username = self.driver.find_element( username = self.driver.find_element(
"xpath", "xpath", "//input[@autocomplete='username']"
"//input[@autocomplete='username']"
) )
username.send_keys(self.username) username.send_keys(self.username)
@@ -108,18 +110,19 @@ class Twitter_Scraper():
input_attempt += 1 input_attempt += 1
if input_attempt >= 3: if input_attempt >= 3:
print() print()
print(""" print(
There was an error inputting the username. """There was an error inputting the username.
It may be due to the following: It may be due to the following:
- Internet connection is unstable - Internet connection is unstable
- Username is incorrect - Username is incorrect
- Twitter is experiencing unusual activity - Twitter is experiencing unusual activity"""
""") )
self.driver.quit() self.driver.quit()
sys.exit(1) sys.exit(1)
else: else:
print("Re-attempting to input username...") print("Re-attempting to input username...")
sleep(2)
def _input_unusual_activity(self): def _input_unusual_activity(self):
input_attempt = 0 input_attempt = 0
@@ -127,8 +130,7 @@ It may be due to the following:
while True: while True:
try: try:
unusual_activity = self.driver.find_element( unusual_activity = self.driver.find_element(
"xpath", "xpath", "//input[@data-testid='ocfEnterTextTextInput']"
"//input[@data-testid='ocfEnterTextTextInput']"
) )
unusual_activity.send_keys(self.username) unusual_activity.send_keys(self.username)
unusual_activity.send_keys(Keys.RETURN) unusual_activity.send_keys(Keys.RETURN)
@@ -145,8 +147,7 @@ It may be due to the following:
while True: while True:
try: try:
password = self.driver.find_element( password = self.driver.find_element(
"xpath", "xpath", "//input[@autocomplete='current-password']"
"//input[@autocomplete='current-password']"
) )
password.send_keys(self.password) password.send_keys(self.password)
@@ -157,18 +158,19 @@ It may be due to the following:
input_attempt += 1 input_attempt += 1
if input_attempt >= 3: if input_attempt >= 3:
print() print()
print(""" print(
There was an error inputting the password. """There was an error inputting the password.
It may be due to the following: It may be due to the following:
- Internet connection is unstable - Internet connection is unstable
- Password is incorrect - Password is incorrect
- Twitter is experiencing unusual activity - Twitter is experiencing unusual activity"""
""") )
self.driver.quit() self.driver.quit()
sys.exit(1) sys.exit(1)
else: else:
print("Re-attempting to input password...") print("Re-attempting to input password...")
sleep(2)
def go_to_home(self): def go_to_home(self):
self.driver.get("https://twitter.com/home") self.driver.get("https://twitter.com/home")
@@ -177,8 +179,7 @@ It may be due to the following:
def get_tweets(self): def get_tweets(self):
self.tweet_cards = self.driver.find_elements( self.tweet_cards = self.driver.find_elements(
'xpath', "xpath", '//article[@data-testid="tweet"]'
'//article[@data-testid="tweet"]'
) )
pass pass
@@ -191,8 +192,8 @@ It may be due to the following:
print("Scraping Tweets...") print("Scraping Tweets...")
self.progress.print_progress(0) self.progress.print_progress(0)
try: while self.scroller.scrolling:
while self.scroller.scrolling: try:
self.get_tweets() self.get_tweets()
for card in self.tweet_cards[-15:]: for card in self.tweet_cards[-15:]:
@@ -235,11 +236,15 @@ It may be due to the following:
else: else:
self.scroller.last_position = self.scroller.current_position self.scroller.last_position = self.scroller.current_position
break break
except StaleElementReferenceException:
callback()
sleep(2)
print("\n") print("\n")
if len(self.data) >= self.max_tweets:
print("Scraping Complete") print("Scraping Complete")
except StaleElementReferenceException: else:
print("\n")
print("Scraping Incomplete") print("Scraping Incomplete")
print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets)) print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
@@ -249,29 +254,29 @@ It may be due to the following:
def save_to_csv(self): def save_to_csv(self):
print("Saving Tweets to CSV...") print("Saving Tweets to CSV...")
now = datetime.now() now = datetime.now()
folder_path = './tweets/' folder_path = "./tweets/"
if not os.path.exists(folder_path): if not os.path.exists(folder_path):
os.makedirs(folder_path) os.makedirs(folder_path)
print("Created Folder: {}".format(folder_path)) print("Created Folder: {}".format(folder_path))
data = { data = {
'Name': [tweet[0] for tweet in self.data], "Name": [tweet[0] for tweet in self.data],
'Handle': [tweet[1] for tweet in self.data], "Handle": [tweet[1] for tweet in self.data],
'Timestamp': [tweet[2] for tweet in self.data], "Timestamp": [tweet[2] for tweet in self.data],
'Verified': [tweet[3] for tweet in self.data], "Verified": [tweet[3] for tweet in self.data],
'Content': [tweet[4] for tweet in self.data], "Content": [tweet[4] for tweet in self.data],
'Comments': [tweet[5] for tweet in self.data], "Comments": [tweet[5] for tweet in self.data],
'Retweets': [tweet[6] for tweet in self.data], "Retweets": [tweet[6] for tweet in self.data],
'Likes': [tweet[7] for tweet in self.data], "Likes": [tweet[7] for tweet in self.data],
'Analytics': [tweet[8] for tweet in self.data], "Analytics": [tweet[8] for tweet in self.data],
'Profile Image': [tweet[9] for tweet in self.data], "Profile Image": [tweet[9] for tweet in self.data],
} }
df = pd.DataFrame(data) df = pd.DataFrame(data)
current_time = now.strftime("%Y-%m-%d_%H-%M-%S") current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
file_path = f'{folder_path}{current_time}_tweets_1-{len(self.data)}.csv' file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
df.to_csv(file_path, index=False) df.to_csv(file_path, index=False)
print("CSV Saved: {}".format(file_path)) print("CSV Saved: {}".format(file_path))