From 04c8949e5b250d3dad89623000df78f72cc5307f Mon Sep 17 00:00:00 2001 From: ReptilianPride <37742577+ReptilianPride@users.noreply.github.com> Date: Fri, 28 Feb 2025 00:24:45 +0000 Subject: [PATCH 1/6] Added choices for headless yes or no, along with user-agent related changes --- .env.example | 1 + scraper/__main__.py | 12 ++++++++++++ scraper/twitter_scraper.py | 14 +++++++++++--- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index e9510eb..217e8b8 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,3 @@ TWITTER_USERNAME=# Your Twitter Handle TWITTER_PASSWORD=# Your Twitter Password +HEADLESS=# Headless Mode "yes" or "no" diff --git a/scraper/__main__.py b/scraper/__main__.py index 7676189..b9349d9 100644 --- a/scraper/__main__.py +++ b/scraper/__main__.py @@ -44,6 +44,13 @@ def main(): default=os.getenv("TWITTER_PASSWORD"), help="Your Twitter password.", ) + + parser.add_argument( + "--headlessState", + type=str, + default=os.getenv("HEADLESS"), + help="Headless mode? 
[yes/no]" + ) except Exception as e: print(f"Error retrieving environment variables: {e}") sys.exit(1) @@ -113,6 +120,7 @@ def main(): USER_MAIL = args.mail USER_UNAME = args.user USER_PASSWORD = args.password + HEADLESS_MODE= args.headlessState if USER_UNAME is None: USER_UNAME = input("Twitter Username: ") @@ -120,6 +128,9 @@ def main(): if USER_PASSWORD is None: USER_PASSWORD = getpass.getpass("Enter Password: ") + if HEADLESS_MODE is None: + HEADLESS_MODE - str(input("Headless?[Yes/No]")).lower() + print() tweet_type_args = [] @@ -146,6 +157,7 @@ def main(): mail=USER_MAIL, username=USER_UNAME, password=USER_PASSWORD, + headlessState=HEADLESS_MODE ) scraper.login() scraper.scrape_tweets( diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index 4fd0fcd..9aa3883 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -37,6 +37,7 @@ class Twitter_Scraper: mail, username, password, + headlessState, max_tweets=50, scrape_username=None, scrape_hashtag=None, @@ -50,6 +51,7 @@ class Twitter_Scraper: self.mail = mail self.username = username self.password = password + self.headlessState = headlessState self.interrupted = False self.tweet_ids = set() self.data = [] @@ -125,7 +127,10 @@ class Twitter_Scraper: proxy=None, ): print("Setup WebDriver...") - header = Headers().generate()["User-Agent"] + # header = Headers().generate()["User-Agent"] + + # User agent of an Android smartphone device + header="Mozilla/5.0 (Linux; Android 11; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.87 Mobile Safari/537.36" # browser_option = ChromeOptions() browser_option = FirefoxOptions() @@ -140,8 +145,11 @@ class Twitter_Scraper: if proxy is not None: browser_option.add_argument("--proxy-server=%s" % proxy) - # For Hiding Browser - browser_option.add_argument("--headless") + # Option to hide browser or not + # If not yes then skips the headless + if self.headlessState == 'yes': + # For Hiding Browser + 
browser_option.add_argument("--headless") try: # print("Initializing ChromeDriver...") From 3800c02a5966774c34341423b631f40f1727b470 Mon Sep 17 00:00:00 2001 From: ReptilianPride <37742577+ReptilianPride@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:44:14 +0000 Subject: [PATCH 2/6] Retry button wait made longer --- scraper/twitter_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index 9aa3883..ff33b14 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -498,7 +498,7 @@ It may be due to the following: retry_button = self.driver.find_element( "xpath", "//span[text()='Retry']/../../..") self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit) - sleep(58) + sleep(300) retry_button.click() retry_cnt += 1 sleep(2) From 380223784250739cb2ed10c6b45813000407d86d Mon Sep 17 00:00:00 2001 From: ReptilianPride <37742577+ReptilianPride@users.noreply.github.com> Date: Sat, 1 Mar 2025 14:06:15 +0000 Subject: [PATCH 3/6] added 10 mins delay for the retry button occurrence --- .env.example | 3 --- scraper/twitter_scraper.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) delete mode 100644 .env.example diff --git a/.env.example b/.env.example deleted file mode 100644 index 217e8b8..0000000 --- a/.env.example +++ /dev/null @@ -1,3 +0,0 @@ -TWITTER_USERNAME=# Your Twitter Handle -TWITTER_PASSWORD=# Your Twitter Password -HEADLESS=# Headless Mode "yes" or "no" diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index ff33b14..4095129 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -498,7 +498,7 @@ It may be due to the following: retry_button = self.driver.find_element( "xpath", "//span[text()='Retry']/../../..") self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit) - sleep(300) + sleep(600) retry_button.click() retry_cnt += 1 sleep(2) From 
de0ef576632f8826eef794e51026c88f0fa5e09b Mon Sep 17 00:00:00 2001 From: ReptilianPride <37742577+ReptilianPride@users.noreply.github.com> Date: Sat, 1 Mar 2025 14:22:55 +0000 Subject: [PATCH 4/6] Re-added .env.example file --- .env.example | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..3c33a8f --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +TWITTER_USERNAME=# Your Twitter Handle +TWITTER_PASSWORD=# Your Twitter Password +HEADLESS=# Headless browser option (use "yes" or "no") From e989d9e8a7ef4e61e04daa34cb36daca6183a1b9 Mon Sep 17 00:00:00 2001 From: ReptilianPride <37742577+ReptilianPride@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:39:04 +0000 Subject: [PATCH 5/6] Increased zoom and unchanged scroll delay --- scraper/scroller.py | 4 ++++ scraper/twitter_scraper.py | 1 + 2 files changed, 5 insertions(+) diff --git a/scraper/scroller.py b/scraper/scroller.py index 24ea636..fdb0d45 100644 --- a/scraper/scroller.py +++ b/scraper/scroller.py @@ -1,3 +1,7 @@ +import time +import random + + class Scroller: def __init__(self, driver) -> None: self.driver = driver diff --git a/scraper/twitter_scraper.py b/scraper/twitter_scraper.py index 4095129..750cfb1 100644 --- a/scraper/twitter_scraper.py +++ b/scraper/twitter_scraper.py @@ -199,6 +199,7 @@ class Twitter_Scraper: try: self.driver.maximize_window() + self.driver.execute_script("document.body.style.zoom='150%'") #set zoom to 150% self.driver.get(TWITTER_LOGIN_URL) sleep(3) From dd223a2134208f5007aba92b1fa7c0164cac255f Mon Sep 17 00:00:00 2001 From: ReptilianPride <37742577+ReptilianPride@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:40:49 +0000 Subject: [PATCH 6/6] added sample command --- sample-command.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 sample-command.txt diff --git a/sample-command.txt b/sample-command.txt new file mode 100644 index 0000000..b9630b1 --- /dev/null +++ 
b/sample-command.txt @@ -0,0 +1 @@ +python scraper --query='("NVDA" OR "nvidia") lang:en until:2024-01-19 since:2024-01-18' -t 5000 --top