Merge pull request #9 from MagiPrince/master
This commit is contained in:
@@ -18,6 +18,7 @@ pip install -r requirements.txt
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
|
TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
|
||||||
|
TWITTER_USERNAME=# Your Twitter Username
|
||||||
TWITTER_PASSWORD=# Your Twitter Password
|
TWITTER_PASSWORD=# Your Twitter Password
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -131,6 +132,9 @@ options: description
|
|||||||
and query-based scraping.
|
and query-based scraping.
|
||||||
usage:
|
usage:
|
||||||
python scraper -t 500 -ht=python --top
|
python scraper -t 500 -ht=python --top
|
||||||
|
|
||||||
|
-ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape
|
||||||
|
(will scrape until no more tweets are available).
|
||||||
```
|
```
|
||||||
|
|
||||||
### Sample Scraping Commands
|
### Sample Scraping Commands
|
||||||
|
|||||||
@@ -24,6 +24,13 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
parser.add_argument(
|
||||||
|
"--mail",
|
||||||
|
type=str,
|
||||||
|
default=os.getenv("TWITTER_MAIL"),
|
||||||
|
help="Your Twitter mail.",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--user",
|
"--user",
|
||||||
type=str,
|
type=str,
|
||||||
@@ -65,6 +72,14 @@ def main():
|
|||||||
help="Twitter hashtag. Scrape tweets from a hashtag.",
|
help="Twitter hashtag. Scrape tweets from a hashtag.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-ntl",
|
||||||
|
"--no_tweets_limit",
|
||||||
|
nargs='?',
|
||||||
|
default=False,
|
||||||
|
help="Set no limit to the number of tweets to scrape (will scrap until no more tweets are available).",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-q",
|
"-q",
|
||||||
"--query",
|
"--query",
|
||||||
@@ -95,6 +110,7 @@ def main():
|
|||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
USER_MAIL = args.mail
|
||||||
USER_UNAME = args.user
|
USER_UNAME = args.user
|
||||||
USER_PASSWORD = args.password
|
USER_PASSWORD = args.password
|
||||||
|
|
||||||
@@ -127,12 +143,14 @@ def main():
|
|||||||
|
|
||||||
if USER_UNAME is not None and USER_PASSWORD is not None:
|
if USER_UNAME is not None and USER_PASSWORD is not None:
|
||||||
scraper = Twitter_Scraper(
|
scraper = Twitter_Scraper(
|
||||||
|
mail=USER_MAIL,
|
||||||
username=USER_UNAME,
|
username=USER_UNAME,
|
||||||
password=USER_PASSWORD,
|
password=USER_PASSWORD,
|
||||||
)
|
)
|
||||||
scraper.login()
|
scraper.login()
|
||||||
scraper.scrape_tweets(
|
scraper.scrape_tweets(
|
||||||
max_tweets=args.tweets,
|
max_tweets=args.tweets,
|
||||||
|
no_tweets_limit= args.no_tweets_limit if args.no_tweets_limit is not None else True,
|
||||||
scrape_username=args.username,
|
scrape_username=args.username,
|
||||||
scrape_hashtag=args.hashtag,
|
scrape_hashtag=args.hashtag,
|
||||||
scrape_query=args.query,
|
scrape_query=args.query,
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ class Progress:
|
|||||||
self.total = total
|
self.total = total
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def print_progress(self, current) -> None:
|
def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None:
|
||||||
self.current = current
|
self.current = current
|
||||||
progress = current / self.total
|
progress = current / self.total
|
||||||
bar_length = 40
|
bar_length = 40
|
||||||
@@ -17,6 +17,27 @@ class Progress:
|
|||||||
+ "-" * (bar_length - int(bar_length * progress))
|
+ "-" * (bar_length - int(bar_length * progress))
|
||||||
+ "]"
|
+ "]"
|
||||||
)
|
)
|
||||||
|
if no_tweets_limit:
|
||||||
|
if waiting:
|
||||||
|
sys.stdout.write(
|
||||||
|
"\rTweets scrapped : {} - waiting to access older tweets {} min on 15 min".format(
|
||||||
|
current, retry_cnt
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
sys.stdout.write(
|
||||||
|
"\rTweets scrapped : {} ".format(
|
||||||
|
current
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if waiting:
|
||||||
|
sys.stdout.write(
|
||||||
|
"\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format(
|
||||||
|
progress_bar, progress, current, self.total, retry_cnt
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
sys.stdout.write(
|
sys.stdout.write(
|
||||||
"\rProgress: [{:<40}] {:.2%} {} of {} ".format(
|
"\rProgress: [{:<40}] {:.2%} {} of {} ".format(
|
||||||
progress_bar, progress, current, self.total
|
progress_bar, progress, current, self.total
|
||||||
|
|||||||
@@ -20,7 +20,13 @@ from selenium.webdriver.common.action_chains import ActionChains
|
|||||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
|
|
||||||
|
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
||||||
|
from selenium.webdriver.firefox.service import Service as FirefoxService
|
||||||
|
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
from webdriver_manager.firefox import GeckoDriverManager
|
||||||
|
|
||||||
TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
||||||
|
|
||||||
@@ -28,6 +34,7 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
|||||||
class Twitter_Scraper:
|
class Twitter_Scraper:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
mail,
|
||||||
username,
|
username,
|
||||||
password,
|
password,
|
||||||
max_tweets=50,
|
max_tweets=50,
|
||||||
@@ -39,6 +46,7 @@ class Twitter_Scraper:
|
|||||||
scrape_top=False,
|
scrape_top=False,
|
||||||
):
|
):
|
||||||
print("Initializing Twitter Scraper...")
|
print("Initializing Twitter Scraper...")
|
||||||
|
self.mail = mail
|
||||||
self.username = username
|
self.username = username
|
||||||
self.password = password
|
self.password = password
|
||||||
self.interrupted = False
|
self.interrupted = False
|
||||||
@@ -115,7 +123,8 @@ class Twitter_Scraper:
|
|||||||
print("Setup WebDriver...")
|
print("Setup WebDriver...")
|
||||||
header = Headers().generate()["User-Agent"]
|
header = Headers().generate()["User-Agent"]
|
||||||
|
|
||||||
browser_option = ChromeOptions()
|
# browser_option = ChromeOptions()
|
||||||
|
browser_option = FirefoxOptions()
|
||||||
browser_option.add_argument("--no-sandbox")
|
browser_option.add_argument("--no-sandbox")
|
||||||
browser_option.add_argument("--disable-dev-shm-usage")
|
browser_option.add_argument("--disable-dev-shm-usage")
|
||||||
browser_option.add_argument("--ignore-certificate-errors")
|
browser_option.add_argument("--ignore-certificate-errors")
|
||||||
@@ -129,8 +138,13 @@ class Twitter_Scraper:
|
|||||||
browser_option.add_argument("--headless")
|
browser_option.add_argument("--headless")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print("Initializing ChromeDriver...")
|
# print("Initializing ChromeDriver...")
|
||||||
driver = webdriver.Chrome(
|
# driver = webdriver.Chrome(
|
||||||
|
# options=browser_option,
|
||||||
|
# )
|
||||||
|
|
||||||
|
print("Initializing FirefoxDriver...")
|
||||||
|
driver = webdriver.Firefox(
|
||||||
options=browser_option,
|
options=browser_option,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -138,13 +152,23 @@ class Twitter_Scraper:
|
|||||||
return driver
|
return driver
|
||||||
except WebDriverException:
|
except WebDriverException:
|
||||||
try:
|
try:
|
||||||
print("Downloading ChromeDriver...")
|
# print("Downloading ChromeDriver...")
|
||||||
chromedriver_path = ChromeDriverManager().install()
|
# chromedriver_path = ChromeDriverManager().install()
|
||||||
chrome_service = ChromeService(executable_path=chromedriver_path)
|
# chrome_service = ChromeService(executable_path=chromedriver_path)
|
||||||
|
|
||||||
print("Initializing ChromeDriver...")
|
print("Downloading FirefoxDriver...")
|
||||||
driver = webdriver.Chrome(
|
firefoxdriver_path = GeckoDriverManager().install()
|
||||||
service=chrome_service,
|
firefox_service = FirefoxService(executable_path=firefoxdriver_path)
|
||||||
|
|
||||||
|
# print("Initializing ChromeDriver...")
|
||||||
|
# driver = webdriver.Chrome(
|
||||||
|
# service=chrome_service,
|
||||||
|
# options=browser_option,
|
||||||
|
# )
|
||||||
|
|
||||||
|
print("Initializing FirefoxDriver...")
|
||||||
|
driver = webdriver.Firefox(
|
||||||
|
service=firefox_service,
|
||||||
options=browser_option,
|
options=browser_option,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -206,7 +230,7 @@ class Twitter_Scraper:
|
|||||||
"xpath", "//input[@autocomplete='username']"
|
"xpath", "//input[@autocomplete='username']"
|
||||||
)
|
)
|
||||||
|
|
||||||
username.send_keys(self.username)
|
username.send_keys(self.mail)
|
||||||
username.send_keys(Keys.RETURN)
|
username.send_keys(Keys.RETURN)
|
||||||
sleep(3)
|
sleep(3)
|
||||||
break
|
break
|
||||||
@@ -345,6 +369,7 @@ It may be due to the following:
|
|||||||
def scrape_tweets(
|
def scrape_tweets(
|
||||||
self,
|
self,
|
||||||
max_tweets=50,
|
max_tweets=50,
|
||||||
|
no_tweets_limit=False,
|
||||||
scrape_username=None,
|
scrape_username=None,
|
||||||
scrape_hashtag=None,
|
scrape_hashtag=None,
|
||||||
scrape_query=None,
|
scrape_query=None,
|
||||||
@@ -387,11 +412,20 @@ It may be due to the following:
|
|||||||
elif self.scraper_details["type"] == "Home":
|
elif self.scraper_details["type"] == "Home":
|
||||||
print("Scraping Tweets from Home...")
|
print("Scraping Tweets from Home...")
|
||||||
|
|
||||||
self.progress.print_progress(0)
|
# Dismiss the cookie banner (clicks "Refuse non-essential cookies") so it disappears
|
||||||
|
try:
|
||||||
|
accept_cookies_btn = self.driver.find_element(
|
||||||
|
"xpath", "//span[text()='Refuse non-essential cookies']/../../..")
|
||||||
|
accept_cookies_btn.click()
|
||||||
|
except NoSuchElementException:
|
||||||
|
pass
|
||||||
|
|
||||||
|
self.progress.print_progress(0, False, 0, no_tweets_limit)
|
||||||
|
|
||||||
refresh_count = 0
|
refresh_count = 0
|
||||||
added_tweets = 0
|
added_tweets = 0
|
||||||
empty_count = 0
|
empty_count = 0
|
||||||
|
retry_cnt = 0
|
||||||
|
|
||||||
while self.scroller.scrolling:
|
while self.scroller.scrolling:
|
||||||
try:
|
try:
|
||||||
@@ -424,9 +458,9 @@ It may be due to the following:
|
|||||||
if not tweet.is_ad:
|
if not tweet.is_ad:
|
||||||
self.data.append(tweet.tweet)
|
self.data.append(tweet.tweet)
|
||||||
added_tweets += 1
|
added_tweets += 1
|
||||||
self.progress.print_progress(len(self.data))
|
self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
|
||||||
|
|
||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets and not no_tweets_limit:
|
||||||
self.scroller.scrolling = False
|
self.scroller.scrolling = False
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@@ -440,10 +474,25 @@ It may be due to the following:
|
|||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets and not no_tweets_limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
if added_tweets == 0:
|
if added_tweets == 0:
|
||||||
|
# Check whether a "Retry" button is present and click it at regular intervals, up to a maximum number of attempts
|
||||||
|
try:
|
||||||
|
while retry_cnt < 15:
|
||||||
|
retry_button = self.driver.find_element(
|
||||||
|
"xpath", "//span[text()='Retry']/../../..")
|
||||||
|
self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
|
||||||
|
sleep(58)
|
||||||
|
retry_button.click()
|
||||||
|
retry_cnt += 1
|
||||||
|
sleep(2)
|
||||||
|
# There is no Retry button, so the counter is reset
|
||||||
|
except NoSuchElementException:
|
||||||
|
retry_cnt = 0
|
||||||
|
self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
|
||||||
|
|
||||||
if empty_count >= 5:
|
if empty_count >= 5:
|
||||||
if refresh_count >= 3:
|
if refresh_count >= 3:
|
||||||
print()
|
print()
|
||||||
@@ -470,11 +519,12 @@ It may be due to the following:
|
|||||||
|
|
||||||
print("")
|
print("")
|
||||||
|
|
||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets or no_tweets_limit:
|
||||||
print("Scraping Complete")
|
print("Scraping Complete")
|
||||||
else:
|
else:
|
||||||
print("Scraping Incomplete")
|
print("Scraping Incomplete")
|
||||||
|
|
||||||
|
if not no_tweets_limit:
|
||||||
print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
|
print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user