Add error handling; the script now runs until an unexpected error occurs or the maximum number of tweets is reached
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import getpass
|
||||
from twitter_scraper import Twitter_Scraper
|
||||
|
||||
try:
|
||||
@@ -16,37 +17,49 @@ except Exception as e:
|
||||
|
||||
def main():
    """Run the Twitter scraper from the command line.

    Credentials are read from the ``TWITTER_USERNAME`` and
    ``TWITTER_PASSWORD`` environment variables; any value that is not set is
    prompted for interactively. The ``--tweets`` option sets how many tweets
    to scrape (default: 50).

    Exits with status 1 when credentials are missing or when the user
    interrupts the run with Ctrl+C.
    """
    try:
        # Parse the command line first so that ``--help`` and argument errors
        # are reported before the user is asked for any credentials.
        parser = argparse.ArgumentParser(description="Twitter Scraper")
        parser.add_argument(
            "--tweets",
            type=int,
            default=50,
            help="Number of tweets to scrape (default: 50)",
        )
        args = parser.parse_args()

        # os.getenv returns None for an unset variable instead of raising,
        # so no exception handling is needed around these lookups.
        USER_UNAME = os.getenv("TWITTER_USERNAME")
        USER_PASSWORD = os.getenv("TWITTER_PASSWORD")

        if USER_UNAME is None:
            USER_UNAME = input("Twitter Username: ")

        if USER_PASSWORD is None:
            USER_PASSWORD = getpass.getpass("Enter Password: ")

        print()

        if USER_UNAME is None or USER_PASSWORD is None:
            print(
                "Missing Twitter username or password environment variables. Please check your .env file."
            )
            sys.exit(1)

        scraper = Twitter_Scraper(
            username=USER_UNAME, password=USER_PASSWORD, max_tweets=args.tweets
        )

        scraper.scrape_tweets()
        scraper.driver.close()
        scraper.save_to_csv()
    except KeyboardInterrupt:
        # Covers Ctrl+C at the prompts as well as during scraping.
        print("\nScript Interrupted by user. Exiting...")
        sys.exit(1)
|
||||
|
||||
|
||||
# Only run the scraper when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -11,8 +11,15 @@ class Progress:
|
||||
self.current = current
|
||||
progress = current / self.total
|
||||
bar_length = 40
|
||||
progress_bar = "[" + "=" * int(bar_length * progress) + \
|
||||
"-" * (bar_length - int(bar_length * progress)) + "]"
|
||||
progress_bar = (
|
||||
"["
|
||||
+ "=" * int(bar_length * progress)
|
||||
+ "-" * (bar_length - int(bar_length * progress))
|
||||
+ "]"
|
||||
)
|
||||
sys.stdout.write(
|
||||
"\rProgress: [{:<40}] {:.2%} {} of {}".format(progress_bar, progress, current, self.total))
|
||||
"\rProgress: [{:<40}] {:.2%} {} of {}".format(
|
||||
progress_bar, progress, current, self.total
|
||||
)
|
||||
)
|
||||
sys.stdout.flush()
|
||||
|
||||
@@ -1,27 +1,26 @@
|
||||
class Scroller:
    """Track and drive the vertical scroll position of a browser page.

    ``driver`` is any object exposing ``execute_script(script)``; every
    method here goes through that single call.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, driver) -> None:
        """Bind to *driver* and record the page's current vertical offset."""
        self.driver = driver
        # Offset most recently read by update_scroll_position().
        self.current_position = 0
        # Offset captured when the scroller was created or last reset.
        self.last_position = driver.execute_script("return window.pageYOffset;")
        # Flag read by callers to decide whether to keep scrolling.
        self.scrolling = True
        # Counter maintained by callers; zeroed again by reset().
        self.scroll_count = 0

    def reset(self) -> None:
        """Clear the tracked positions and counter, re-reading the page offset.

        ``scrolling`` is intentionally left untouched.
        """
        self.current_position = 0
        self.last_position = self.driver.execute_script("return window.pageYOffset;")
        self.scroll_count = 0

    def scroll_to_top(self) -> None:
        """Scroll the window to the top of the page."""
        self.driver.execute_script("window.scrollTo(0, 0);")

    def scroll_to_bottom(self) -> None:
        """Scroll the window to the current bottom of the document body."""
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    def update_scroll_position(self) -> None:
        """Store the page's current vertical offset in ``current_position``."""
        self.current_position = self.driver.execute_script("return window.pageYOffset;")
|
||||
|
||||
@@ -7,23 +7,20 @@ class Tweet:
|
||||
self.card = card
|
||||
|
||||
self.user = card.find_element(
|
||||
'xpath',
|
||||
'.//div[@data-testid="User-Name"]//span'
|
||||
"xpath", './/div[@data-testid="User-Name"]//span'
|
||||
).text
|
||||
|
||||
try:
|
||||
self.handle = card.find_element(
|
||||
'xpath',
|
||||
'.//span[contains(text(), "@")]'
|
||||
"xpath", './/span[contains(text(), "@")]'
|
||||
).text
|
||||
except NoSuchElementException:
|
||||
return
|
||||
|
||||
try:
|
||||
self.date_time = card.find_element(
|
||||
'xpath',
|
||||
'.//time'
|
||||
).get_attribute('datetime')
|
||||
self.date_time = card.find_element("xpath", ".//time").get_attribute(
|
||||
"datetime"
|
||||
)
|
||||
|
||||
if self.date_time is not None:
|
||||
self.is_ad = False
|
||||
@@ -33,8 +30,7 @@ class Tweet:
|
||||
|
||||
try:
|
||||
card.find_element(
|
||||
'xpath',
|
||||
'.//*[local-name()="svg" and @data-testid="icon-verified"]'
|
||||
"xpath", './/*[local-name()="svg" and @data-testid="icon-verified"]'
|
||||
)
|
||||
|
||||
self.verified = True
|
||||
@@ -43,8 +39,8 @@ class Tweet:
|
||||
|
||||
self.content = ""
|
||||
contents = card.find_elements(
|
||||
'xpath',
|
||||
'.//div[@data-testid="tweetText"]/span | .//div[@data-testid="tweetText"]/a'
|
||||
"xpath",
|
||||
'.//div[@data-testid="tweetText"]/span | .//div[@data-testid="tweetText"]/a',
|
||||
)
|
||||
|
||||
for index, content in enumerate(contents):
|
||||
@@ -52,43 +48,38 @@ class Tweet:
|
||||
|
||||
try:
|
||||
self.reply_cnt = card.find_element(
|
||||
'xpath',
|
||||
'.//div[@data-testid="reply"]//span'
|
||||
"xpath", './/div[@data-testid="reply"]//span'
|
||||
).text
|
||||
except NoSuchElementException:
|
||||
self.reply_cnt = '0'
|
||||
self.reply_cnt = "0"
|
||||
|
||||
try:
|
||||
self.retweet_cnt = card.find_element(
|
||||
'xpath',
|
||||
'.//div[@data-testid="retweet"]//span'
|
||||
"xpath", './/div[@data-testid="retweet"]//span'
|
||||
).text
|
||||
except NoSuchElementException:
|
||||
self.retweet_cnt = '0'
|
||||
self.retweet_cnt = "0"
|
||||
|
||||
try:
|
||||
self.like_cnt = card.find_element(
|
||||
'xpath',
|
||||
'.//div[@data-testid="like"]//span'
|
||||
"xpath", './/div[@data-testid="like"]//span'
|
||||
).text
|
||||
except NoSuchElementException:
|
||||
self.like_cnt = '0'
|
||||
self.like_cnt = "0"
|
||||
|
||||
try:
|
||||
self.analytics_cnt = card.find_element(
|
||||
'xpath',
|
||||
'.//a[contains(@href, "/analytics")]//span'
|
||||
"xpath", './/a[contains(@href, "/analytics")]//span'
|
||||
).text
|
||||
except NoSuchElementException:
|
||||
self.analytics_cnt = '0'
|
||||
self.analytics_cnt = "0"
|
||||
|
||||
try:
|
||||
self.profile_img = card.find_element(
|
||||
'xpath',
|
||||
'.//div[@data-testid="Tweet-User-Avatar"]//img'
|
||||
).get_attribute('src')
|
||||
"xpath", './/div[@data-testid="Tweet-User-Avatar"]//img'
|
||||
).get_attribute("src")
|
||||
except NoSuchElementException:
|
||||
self.profile_img = ''
|
||||
self.profile_img = ""
|
||||
|
||||
self.tweet = (
|
||||
self.user,
|
||||
@@ -100,7 +91,7 @@ class Tweet:
|
||||
self.retweet_cnt,
|
||||
self.like_cnt,
|
||||
self.analytics_cnt,
|
||||
self.profile_img
|
||||
self.profile_img,
|
||||
)
|
||||
|
||||
pass
|
||||
|
||||
@@ -10,7 +10,11 @@ from fake_headers import Headers
|
||||
from time import sleep
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, WebDriverException
|
||||
from selenium.common.exceptions import (
|
||||
NoSuchElementException,
|
||||
StaleElementReferenceException,
|
||||
WebDriverException,
|
||||
)
|
||||
|
||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
@@ -20,7 +24,7 @@ from webdriver_manager.chrome import ChromeDriverManager
|
||||
TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
||||
|
||||
|
||||
class Twitter_Scraper():
|
||||
class Twitter_Scraper:
|
||||
def __init__(self, username, password, max_tweets=50):
|
||||
print("Initializing Twitter Scraper...")
|
||||
self.username = username
|
||||
@@ -36,17 +40,17 @@ class Twitter_Scraper():
|
||||
|
||||
def _get_driver(self):
|
||||
print("Setup WebDriver...")
|
||||
header = Headers().generate()['User-Agent']
|
||||
header = Headers().generate()["User-Agent"]
|
||||
|
||||
browser_option = ChromeOptions()
|
||||
browser_option.add_argument('--no-sandbox')
|
||||
browser_option.add_argument("--no-sandbox")
|
||||
browser_option.add_argument("--disable-dev-shm-usage")
|
||||
browser_option.add_argument('--ignore-certificate-errors')
|
||||
browser_option.add_argument('--disable-gpu')
|
||||
browser_option.add_argument('--log-level=3')
|
||||
browser_option.add_argument('--disable-notifications')
|
||||
browser_option.add_argument('--disable-popup-blocking')
|
||||
browser_option.add_argument('--user-agent={}'.format(header))
|
||||
browser_option.add_argument("--ignore-certificate-errors")
|
||||
browser_option.add_argument("--disable-gpu")
|
||||
browser_option.add_argument("--log-level=3")
|
||||
browser_option.add_argument("--disable-notifications")
|
||||
browser_option.add_argument("--disable-popup-blocking")
|
||||
browser_option.add_argument("--user-agent={}".format(header))
|
||||
|
||||
# For Hiding Browser
|
||||
browser_option.add_argument("--headless")
|
||||
@@ -62,8 +66,7 @@ class Twitter_Scraper():
|
||||
try:
|
||||
print("Downloading ChromeDriver...")
|
||||
chromedriver_path = ChromeDriverManager().install()
|
||||
chrome_service = ChromeService(
|
||||
executable_path=chromedriver_path)
|
||||
chrome_service = ChromeService(executable_path=chromedriver_path)
|
||||
|
||||
print("Initializing ChromeDriver...")
|
||||
driver = webdriver.Chrome(
|
||||
@@ -96,8 +99,7 @@ class Twitter_Scraper():
|
||||
while True:
|
||||
try:
|
||||
username = self.driver.find_element(
|
||||
"xpath",
|
||||
"//input[@autocomplete='username']"
|
||||
"xpath", "//input[@autocomplete='username']"
|
||||
)
|
||||
|
||||
username.send_keys(self.username)
|
||||
@@ -108,18 +110,19 @@ class Twitter_Scraper():
|
||||
input_attempt += 1
|
||||
if input_attempt >= 3:
|
||||
print()
|
||||
print("""
|
||||
There was an error inputting the username.
|
||||
print(
|
||||
"""There was an error inputting the username.
|
||||
|
||||
It may be due to the following:
|
||||
- Internet connection is unstable
|
||||
- Username is incorrect
|
||||
- Twitter is experiencing unusual activity
|
||||
""")
|
||||
- Twitter is experiencing unusual activity"""
|
||||
)
|
||||
self.driver.quit()
|
||||
sys.exit(1)
|
||||
else:
|
||||
print("Re-attempting to input username...")
|
||||
sleep(2)
|
||||
|
||||
def _input_unusual_activity(self):
|
||||
input_attempt = 0
|
||||
@@ -127,8 +130,7 @@ It may be due to the following:
|
||||
while True:
|
||||
try:
|
||||
unusual_activity = self.driver.find_element(
|
||||
"xpath",
|
||||
"//input[@data-testid='ocfEnterTextTextInput']"
|
||||
"xpath", "//input[@data-testid='ocfEnterTextTextInput']"
|
||||
)
|
||||
unusual_activity.send_keys(self.username)
|
||||
unusual_activity.send_keys(Keys.RETURN)
|
||||
@@ -145,8 +147,7 @@ It may be due to the following:
|
||||
while True:
|
||||
try:
|
||||
password = self.driver.find_element(
|
||||
"xpath",
|
||||
"//input[@autocomplete='current-password']"
|
||||
"xpath", "//input[@autocomplete='current-password']"
|
||||
)
|
||||
|
||||
password.send_keys(self.password)
|
||||
@@ -157,18 +158,19 @@ It may be due to the following:
|
||||
input_attempt += 1
|
||||
if input_attempt >= 3:
|
||||
print()
|
||||
print("""
|
||||
There was an error inputting the password.
|
||||
print(
|
||||
"""There was an error inputting the password.
|
||||
|
||||
It may be due to the following:
|
||||
- Internet connection is unstable
|
||||
- Password is incorrect
|
||||
- Twitter is experiencing unusual activity
|
||||
""")
|
||||
- Twitter is experiencing unusual activity"""
|
||||
)
|
||||
self.driver.quit()
|
||||
sys.exit(1)
|
||||
else:
|
||||
print("Re-attempting to input password...")
|
||||
sleep(2)
|
||||
|
||||
def go_to_home(self):
|
||||
self.driver.get("https://twitter.com/home")
|
||||
@@ -177,8 +179,7 @@ It may be due to the following:
|
||||
|
||||
def get_tweets(self):
    """Refresh ``self.tweet_cards`` with the tweet elements currently on the page.

    Looks up every ``<article data-testid="tweet">`` element through
    ``self.driver.find_elements`` and stores the resulting list; nothing is
    returned.
    """
    self.tweet_cards = self.driver.find_elements(
        "xpath", '//article[@data-testid="tweet"]'
    )
|
||||
|
||||
@@ -191,8 +192,8 @@ It may be due to the following:
|
||||
print("Scraping Tweets...")
|
||||
self.progress.print_progress(0)
|
||||
|
||||
try:
|
||||
while self.scroller.scrolling:
|
||||
while self.scroller.scrolling:
|
||||
try:
|
||||
self.get_tweets()
|
||||
|
||||
for card in self.tweet_cards[-15:]:
|
||||
@@ -235,11 +236,15 @@ It may be due to the following:
|
||||
else:
|
||||
self.scroller.last_position = self.scroller.current_position
|
||||
break
|
||||
except StaleElementReferenceException:
|
||||
callback()
|
||||
sleep(2)
|
||||
|
||||
print("\n")
|
||||
print("\n")
|
||||
|
||||
if len(self.data) >= self.max_tweets:
|
||||
print("Scraping Complete")
|
||||
except StaleElementReferenceException:
|
||||
print("\n")
|
||||
else:
|
||||
print("Scraping Incomplete")
|
||||
|
||||
print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
|
||||
@@ -249,29 +254,29 @@ It may be due to the following:
|
||||
def save_to_csv(self):
    """Write every scraped tweet in ``self.data`` to a timestamped CSV file.

    Each entry of ``self.data`` is indexed positionally (0-9) to fill the
    columns listed below. The file is written to ``./tweets/`` (created if
    it does not exist) and named
    ``<YYYY-MM-DD_HH-MM-SS>_tweets_1-<count>.csv``.
    """
    print("Saving Tweets to CSV...")
    now = datetime.now()
    folder_path = "./tweets/"

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("Created Folder: {}".format(folder_path))

    # Column order mirrors the positional layout of each tweet record.
    columns = (
        "Name",
        "Handle",
        "Timestamp",
        "Verified",
        "Content",
        "Comments",
        "Retweets",
        "Likes",
        "Analytics",
        "Profile Image",
    )
    data = {
        column: [tweet[index] for tweet in self.data]
        for index, column in enumerate(columns)
    }

    df = pd.DataFrame(data)

    current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
    file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
    df.to_csv(file_path, index=False)

    print("CSV Saved: {}".format(file_path))
|
||||
|
||||
Reference in New Issue
Block a user