Merge pull request #9 from MagiPrince/master
This commit is contained in:
@@ -18,6 +18,7 @@ pip install -r requirements.txt
|
||||
|
||||
```bash
|
||||
TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
|
||||
TWITTER_MAIL=# Your Twitter Mail
|
||||
TWITTER_PASSWORD=# Your Twitter Password
|
||||
```
|
||||
|
||||
@@ -131,6 +132,9 @@ options: description
|
||||
and query-based scraping.
|
||||
usage:
|
||||
python scraper -t 500 -ht=python --top
|
||||
|
||||
-ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape
|
||||
(will scrape until no more tweets are available).
|
||||
```
|
||||
|
||||
### Sample Scraping Commands
|
||||
|
||||
@@ -24,6 +24,13 @@ def main():
|
||||
)
|
||||
|
||||
try:
|
||||
parser.add_argument(
|
||||
"--mail",
|
||||
type=str,
|
||||
default=os.getenv("TWITTER_MAIL"),
|
||||
help="Your Twitter mail.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--user",
|
||||
type=str,
|
||||
@@ -65,6 +72,14 @@ def main():
|
||||
help="Twitter hashtag. Scrape tweets from a hashtag.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-ntl",
|
||||
"--no_tweets_limit",
|
||||
nargs='?',
|
||||
default=False,
|
||||
help="Set no limit to the number of tweets to scrape (will scrap until no more tweets are available).",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-q",
|
||||
"--query",
|
||||
@@ -95,6 +110,7 @@ def main():
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
USER_MAIL = args.mail
|
||||
USER_UNAME = args.user
|
||||
USER_PASSWORD = args.password
|
||||
|
||||
@@ -127,12 +143,14 @@ def main():
|
||||
|
||||
if USER_UNAME is not None and USER_PASSWORD is not None:
|
||||
scraper = Twitter_Scraper(
|
||||
mail=USER_MAIL,
|
||||
username=USER_UNAME,
|
||||
password=USER_PASSWORD,
|
||||
)
|
||||
scraper.login()
|
||||
scraper.scrape_tweets(
|
||||
max_tweets=args.tweets,
|
||||
no_tweets_limit= args.no_tweets_limit if args.no_tweets_limit is not None else True,
|
||||
scrape_username=args.username,
|
||||
scrape_hashtag=args.hashtag,
|
||||
scrape_query=args.query,
|
||||
|
||||
@@ -7,7 +7,7 @@ class Progress:
|
||||
self.total = total
|
||||
pass
|
||||
|
||||
def print_progress(self, current) -> None:
|
||||
def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None:
|
||||
self.current = current
|
||||
progress = current / self.total
|
||||
bar_length = 40
|
||||
@@ -17,8 +17,29 @@ class Progress:
|
||||
+ "-" * (bar_length - int(bar_length * progress))
|
||||
+ "]"
|
||||
)
|
||||
if no_tweets_limit:
|
||||
if waiting:
|
||||
sys.stdout.write(
|
||||
"\rProgress: [{:<40}] {:.2%} {} of {}".format(
|
||||
"\rTweets scrapped : {} - waiting to access older tweets {} min on 15 min".format(
|
||||
current, retry_cnt
|
||||
)
|
||||
)
|
||||
else:
|
||||
sys.stdout.write(
|
||||
"\rTweets scrapped : {} ".format(
|
||||
current
|
||||
)
|
||||
)
|
||||
else:
|
||||
if waiting:
|
||||
sys.stdout.write(
|
||||
"\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format(
|
||||
progress_bar, progress, current, self.total, retry_cnt
|
||||
)
|
||||
)
|
||||
else:
|
||||
sys.stdout.write(
|
||||
"\rProgress: [{:<40}] {:.2%} {} of {} ".format(
|
||||
progress_bar, progress, current, self.total
|
||||
)
|
||||
)
|
||||
|
||||
@@ -20,7 +20,13 @@ from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
|
||||
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
||||
from selenium.webdriver.firefox.service import Service as FirefoxService
|
||||
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from webdriver_manager.firefox import GeckoDriverManager
|
||||
|
||||
TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
||||
|
||||
@@ -28,6 +34,7 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
||||
class Twitter_Scraper:
|
||||
def __init__(
|
||||
self,
|
||||
mail,
|
||||
username,
|
||||
password,
|
||||
max_tweets=50,
|
||||
@@ -39,6 +46,7 @@ class Twitter_Scraper:
|
||||
scrape_top=False,
|
||||
):
|
||||
print("Initializing Twitter Scraper...")
|
||||
self.mail = mail
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.interrupted = False
|
||||
@@ -115,7 +123,8 @@ class Twitter_Scraper:
|
||||
print("Setup WebDriver...")
|
||||
header = Headers().generate()["User-Agent"]
|
||||
|
||||
browser_option = ChromeOptions()
|
||||
# browser_option = ChromeOptions()
|
||||
browser_option = FirefoxOptions()
|
||||
browser_option.add_argument("--no-sandbox")
|
||||
browser_option.add_argument("--disable-dev-shm-usage")
|
||||
browser_option.add_argument("--ignore-certificate-errors")
|
||||
@@ -129,8 +138,13 @@ class Twitter_Scraper:
|
||||
browser_option.add_argument("--headless")
|
||||
|
||||
try:
|
||||
print("Initializing ChromeDriver...")
|
||||
driver = webdriver.Chrome(
|
||||
# print("Initializing ChromeDriver...")
|
||||
# driver = webdriver.Chrome(
|
||||
# options=browser_option,
|
||||
# )
|
||||
|
||||
print("Initializing FirefoxDriver...")
|
||||
driver = webdriver.Firefox(
|
||||
options=browser_option,
|
||||
)
|
||||
|
||||
@@ -138,13 +152,23 @@ class Twitter_Scraper:
|
||||
return driver
|
||||
except WebDriverException:
|
||||
try:
|
||||
print("Downloading ChromeDriver...")
|
||||
chromedriver_path = ChromeDriverManager().install()
|
||||
chrome_service = ChromeService(executable_path=chromedriver_path)
|
||||
# print("Downloading ChromeDriver...")
|
||||
# chromedriver_path = ChromeDriverManager().install()
|
||||
# chrome_service = ChromeService(executable_path=chromedriver_path)
|
||||
|
||||
print("Initializing ChromeDriver...")
|
||||
driver = webdriver.Chrome(
|
||||
service=chrome_service,
|
||||
print("Downloading FirefoxDriver...")
|
||||
firefoxdriver_path = GeckoDriverManager().install()
|
||||
firefox_service = FirefoxService(executable_path=firefoxdriver_path)
|
||||
|
||||
# print("Initializing ChromeDriver...")
|
||||
# driver = webdriver.Chrome(
|
||||
# service=chrome_service,
|
||||
# options=browser_option,
|
||||
# )
|
||||
|
||||
print("Initializing FirefoxDriver...")
|
||||
driver = webdriver.Firefox(
|
||||
service=firefox_service,
|
||||
options=browser_option,
|
||||
)
|
||||
|
||||
@@ -206,7 +230,7 @@ class Twitter_Scraper:
|
||||
"xpath", "//input[@autocomplete='username']"
|
||||
)
|
||||
|
||||
username.send_keys(self.username)
|
||||
username.send_keys(self.mail)
|
||||
username.send_keys(Keys.RETURN)
|
||||
sleep(3)
|
||||
break
|
||||
@@ -345,6 +369,7 @@ It may be due to the following:
|
||||
def scrape_tweets(
|
||||
self,
|
||||
max_tweets=50,
|
||||
no_tweets_limit=False,
|
||||
scrape_username=None,
|
||||
scrape_hashtag=None,
|
||||
scrape_query=None,
|
||||
@@ -387,11 +412,20 @@ It may be due to the following:
|
||||
elif self.scraper_details["type"] == "Home":
|
||||
print("Scraping Tweets from Home...")
|
||||
|
||||
self.progress.print_progress(0)
|
||||
# Dismiss the cookie banner (by refusing non-essential cookies) so it disappears
|
||||
try:
|
||||
accept_cookies_btn = self.driver.find_element(
|
||||
"xpath", "//span[text()='Refuse non-essential cookies']/../../..")
|
||||
accept_cookies_btn.click()
|
||||
except NoSuchElementException:
|
||||
pass
|
||||
|
||||
self.progress.print_progress(0, False, 0, no_tweets_limit)
|
||||
|
||||
refresh_count = 0
|
||||
added_tweets = 0
|
||||
empty_count = 0
|
||||
retry_cnt = 0
|
||||
|
||||
while self.scroller.scrolling:
|
||||
try:
|
||||
@@ -424,9 +458,9 @@ It may be due to the following:
|
||||
if not tweet.is_ad:
|
||||
self.data.append(tweet.tweet)
|
||||
added_tweets += 1
|
||||
self.progress.print_progress(len(self.data))
|
||||
self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
|
||||
|
||||
if len(self.data) >= self.max_tweets:
|
||||
if len(self.data) >= self.max_tweets and not no_tweets_limit:
|
||||
self.scroller.scrolling = False
|
||||
break
|
||||
else:
|
||||
@@ -440,10 +474,25 @@ It may be due to the following:
|
||||
except NoSuchElementException:
|
||||
continue
|
||||
|
||||
if len(self.data) >= self.max_tweets:
|
||||
if len(self.data) >= self.max_tweets and not no_tweets_limit:
|
||||
break
|
||||
|
||||
if added_tweets == 0:
|
||||
# Check whether a "Retry" button is present and, if so, click it at regular intervals for a limited number of attempts
|
||||
try:
|
||||
while retry_cnt < 15:
|
||||
retry_button = self.driver.find_element(
|
||||
"xpath", "//span[text()='Retry']/../../..")
|
||||
self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
|
||||
sleep(58)
|
||||
retry_button.click()
|
||||
retry_cnt += 1
|
||||
sleep(2)
|
||||
# There is no Retry button, so the counter is reset
|
||||
except NoSuchElementException:
|
||||
retry_cnt = 0
|
||||
self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
|
||||
|
||||
if empty_count >= 5:
|
||||
if refresh_count >= 3:
|
||||
print()
|
||||
@@ -470,11 +519,12 @@ It may be due to the following:
|
||||
|
||||
print("")
|
||||
|
||||
if len(self.data) >= self.max_tweets:
|
||||
if len(self.data) >= self.max_tweets or no_tweets_limit:
|
||||
print("Scraping Complete")
|
||||
else:
|
||||
print("Scraping Incomplete")
|
||||
|
||||
if not no_tweets_limit:
|
||||
print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
|
||||
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user