feat: Added choices for headless yes or no, along with user-agent related changes. (#27 from ReptilianPride/browser-options-update)
Added choices for headless yes or no, along with user-agent related changes.
This commit is contained in:
@@ -1,2 +1,3 @@
|
|||||||
TWITTER_USERNAME=# Your Twitter Handle
|
TWITTER_USERNAME=# Your Twitter Handle
|
||||||
TWITTER_PASSWORD=# Your Twitter Password
|
TWITTER_PASSWORD=# Your Twitter Password
|
||||||
|
HEADLESS=# Headless browser option (use "yes" or "no")
|
||||||
|
|||||||
1
sample-command.txt
Normal file
1
sample-command.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
python scraper --query='("NVDA" OR "nvidia") lang:en until:2024-01-19 since:2024-01-18' -t 5000 --top
|
||||||
@@ -44,6 +44,13 @@ def main():
|
|||||||
default=os.getenv("TWITTER_PASSWORD"),
|
default=os.getenv("TWITTER_PASSWORD"),
|
||||||
help="Your Twitter password.",
|
help="Your Twitter password.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--headlessState",
|
||||||
|
type=str,
|
||||||
|
default=os.getenv("HEADLESS"),
|
||||||
|
help="Headless mode? [yes/no]"
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error retrieving environment variables: {e}")
|
print(f"Error retrieving environment variables: {e}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
@@ -113,6 +120,7 @@ def main():
|
|||||||
USER_MAIL = args.mail
|
USER_MAIL = args.mail
|
||||||
USER_UNAME = args.user
|
USER_UNAME = args.user
|
||||||
USER_PASSWORD = args.password
|
USER_PASSWORD = args.password
|
||||||
|
HEADLESS_MODE= args.headlessState
|
||||||
|
|
||||||
if USER_UNAME is None:
|
if USER_UNAME is None:
|
||||||
USER_UNAME = input("Twitter Username: ")
|
USER_UNAME = input("Twitter Username: ")
|
||||||
@@ -120,6 +128,9 @@ def main():
|
|||||||
if USER_PASSWORD is None:
|
if USER_PASSWORD is None:
|
||||||
USER_PASSWORD = getpass.getpass("Enter Password: ")
|
USER_PASSWORD = getpass.getpass("Enter Password: ")
|
||||||
|
|
||||||
|
if HEADLESS_MODE is None:
|
||||||
|
HEADLESS_MODE = str(input("Headless?[Yes/No]")).lower()
|
||||||
|
|
||||||
print()
|
print()
|
||||||
|
|
||||||
tweet_type_args = []
|
tweet_type_args = []
|
||||||
@@ -146,6 +157,7 @@ def main():
|
|||||||
mail=USER_MAIL,
|
mail=USER_MAIL,
|
||||||
username=USER_UNAME,
|
username=USER_UNAME,
|
||||||
password=USER_PASSWORD,
|
password=USER_PASSWORD,
|
||||||
|
headlessState=HEADLESS_MODE
|
||||||
)
|
)
|
||||||
scraper.login()
|
scraper.login()
|
||||||
scraper.scrape_tweets(
|
scraper.scrape_tweets(
|
||||||
|
|||||||
@@ -1,3 +1,7 @@
|
|||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
class Scroller:
|
class Scroller:
|
||||||
def __init__(self, driver) -> None:
|
def __init__(self, driver) -> None:
|
||||||
self.driver = driver
|
self.driver = driver
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ class Twitter_Scraper:
|
|||||||
mail,
|
mail,
|
||||||
username,
|
username,
|
||||||
password,
|
password,
|
||||||
|
headlessState,
|
||||||
max_tweets=50,
|
max_tweets=50,
|
||||||
scrape_username=None,
|
scrape_username=None,
|
||||||
scrape_hashtag=None,
|
scrape_hashtag=None,
|
||||||
@@ -50,6 +51,7 @@ class Twitter_Scraper:
|
|||||||
self.mail = mail
|
self.mail = mail
|
||||||
self.username = username
|
self.username = username
|
||||||
self.password = password
|
self.password = password
|
||||||
|
self.headlessState = headlessState
|
||||||
self.interrupted = False
|
self.interrupted = False
|
||||||
self.tweet_ids = set()
|
self.tweet_ids = set()
|
||||||
self.data = []
|
self.data = []
|
||||||
@@ -125,7 +127,10 @@ class Twitter_Scraper:
|
|||||||
proxy=None,
|
proxy=None,
|
||||||
):
|
):
|
||||||
print("Setup WebDriver...")
|
print("Setup WebDriver...")
|
||||||
header = Headers().generate()["User-Agent"]
|
# header = Headers().generate()["User-Agent"]
|
||||||
|
|
||||||
|
# User agent of an Android smartphone device
|
||||||
|
header="Mozilla/5.0 (Linux; Android 11; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.87 Mobile Safari/537.36"
|
||||||
|
|
||||||
# browser_option = ChromeOptions()
|
# browser_option = ChromeOptions()
|
||||||
browser_option = FirefoxOptions()
|
browser_option = FirefoxOptions()
|
||||||
@@ -140,8 +145,11 @@ class Twitter_Scraper:
|
|||||||
if proxy is not None:
|
if proxy is not None:
|
||||||
browser_option.add_argument("--proxy-server=%s" % proxy)
|
browser_option.add_argument("--proxy-server=%s" % proxy)
|
||||||
|
|
||||||
# For Hiding Browser
|
# Option to hide browser or not
|
||||||
browser_option.add_argument("--headless")
|
# Headless mode is enabled only when the value is exactly 'yes'; any other value leaves the browser visible
|
||||||
|
if self.headlessState == 'yes':
|
||||||
|
# For Hiding Browser
|
||||||
|
browser_option.add_argument("--headless")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# print("Initializing ChromeDriver...")
|
# print("Initializing ChromeDriver...")
|
||||||
@@ -191,6 +199,7 @@ class Twitter_Scraper:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
self.driver.maximize_window()
|
self.driver.maximize_window()
|
||||||
|
self.driver.execute_script("document.body.style.zoom='150%'") #set zoom to 150%
|
||||||
self.driver.get(TWITTER_LOGIN_URL)
|
self.driver.get(TWITTER_LOGIN_URL)
|
||||||
sleep(3)
|
sleep(3)
|
||||||
|
|
||||||
@@ -490,7 +499,7 @@ It may be due to the following:
|
|||||||
retry_button = self.driver.find_element(
|
retry_button = self.driver.find_element(
|
||||||
"xpath", "//span[text()='Retry']/../../..")
|
"xpath", "//span[text()='Retry']/../../..")
|
||||||
self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
|
self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
|
||||||
sleep(58)
|
sleep(600)
|
||||||
retry_button.click()
|
retry_button.click()
|
||||||
retry_cnt += 1
|
retry_cnt += 1
|
||||||
sleep(2)
|
sleep(2)
|
||||||
|
|||||||
Reference in New Issue
Block a user