Add support for using the Twitter username when requested, and the option to scrape tweets with no limit (although the practical limit is the number of tweets the browser is able to handle)
This commit is contained in:
@@ -18,6 +18,7 @@ pip install -r requirements.txt
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
|
TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
|
||||||
|
TWITTER_USERNAME=# Your Twitter Username
|
||||||
TWITTER_PASSWORD=# Your Twitter Password
|
TWITTER_PASSWORD=# Your Twitter Password
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -131,6 +132,9 @@ options: description
|
|||||||
and query-based scraping.
|
and query-based scraping.
|
||||||
usage:
|
usage:
|
||||||
python scraper -t 500 -ht=python --top
|
python scraper -t 500 -ht=python --top
|
||||||
|
|
||||||
|
-ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape
|
||||||
|
(will scrap until no more tweets are available).
|
||||||
```
|
```
|
||||||
|
|
||||||
### Sample Scraping Commands
|
### Sample Scraping Commands
|
||||||
|
|||||||
@@ -24,6 +24,13 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
parser.add_argument(
|
||||||
|
"--mail",
|
||||||
|
type=str,
|
||||||
|
default=os.getenv("TWITTER_MAIL"),
|
||||||
|
help="Your Twitter mail.",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--user",
|
"--user",
|
||||||
type=str,
|
type=str,
|
||||||
@@ -65,6 +72,14 @@ def main():
|
|||||||
help="Twitter hashtag. Scrape tweets from a hashtag.",
|
help="Twitter hashtag. Scrape tweets from a hashtag.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-ntl",
|
||||||
|
"--no_tweets_limit",
|
||||||
|
nargs='?',
|
||||||
|
default=False,
|
||||||
|
help="Set no limit to the number of tweets to scrape (will scrap until no more tweets are available).",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-q",
|
"-q",
|
||||||
"--query",
|
"--query",
|
||||||
@@ -95,6 +110,7 @@ def main():
|
|||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
USER_MAIL = args.mail
|
||||||
USER_UNAME = args.user
|
USER_UNAME = args.user
|
||||||
USER_PASSWORD = args.password
|
USER_PASSWORD = args.password
|
||||||
|
|
||||||
@@ -127,12 +143,14 @@ def main():
|
|||||||
|
|
||||||
if USER_UNAME is not None and USER_PASSWORD is not None:
|
if USER_UNAME is not None and USER_PASSWORD is not None:
|
||||||
scraper = Twitter_Scraper(
|
scraper = Twitter_Scraper(
|
||||||
|
mail=USER_MAIL,
|
||||||
username=USER_UNAME,
|
username=USER_UNAME,
|
||||||
password=USER_PASSWORD,
|
password=USER_PASSWORD,
|
||||||
)
|
)
|
||||||
scraper.login()
|
scraper.login()
|
||||||
scraper.scrape_tweets(
|
scraper.scrape_tweets(
|
||||||
max_tweets=args.tweets,
|
max_tweets=args.tweets,
|
||||||
|
no_tweets_limit= args.no_tweets_limit if args.no_tweets_limit is not None else True,
|
||||||
scrape_username=args.username,
|
scrape_username=args.username,
|
||||||
scrape_hashtag=args.hashtag,
|
scrape_hashtag=args.hashtag,
|
||||||
scrape_query=args.query,
|
scrape_query=args.query,
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ class Progress:
|
|||||||
self.total = total
|
self.total = total
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def print_progress(self, current) -> None:
|
def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None:
|
||||||
self.current = current
|
self.current = current
|
||||||
progress = current / self.total
|
progress = current / self.total
|
||||||
bar_length = 40
|
bar_length = 40
|
||||||
@@ -17,8 +17,29 @@ class Progress:
|
|||||||
+ "-" * (bar_length - int(bar_length * progress))
|
+ "-" * (bar_length - int(bar_length * progress))
|
||||||
+ "]"
|
+ "]"
|
||||||
)
|
)
|
||||||
|
if no_tweets_limit:
|
||||||
|
if waiting:
|
||||||
sys.stdout.write(
|
sys.stdout.write(
|
||||||
"\rProgress: [{:<40}] {:.2%} {} of {}".format(
|
"\rTweets scrapped : {} - waiting to access older tweets {} min on 15 min".format(
|
||||||
|
current, retry_cnt
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
sys.stdout.write(
|
||||||
|
"\rTweets scrapped : {} ".format(
|
||||||
|
current
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if waiting:
|
||||||
|
sys.stdout.write(
|
||||||
|
"\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format(
|
||||||
|
progress_bar, progress, current, self.total, retry_cnt
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
sys.stdout.write(
|
||||||
|
"\rProgress: [{:<40}] {:.2%} {} of {} ".format(
|
||||||
progress_bar, progress, current, self.total
|
progress_bar, progress, current, self.total
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -20,7 +20,13 @@ from selenium.webdriver.common.action_chains import ActionChains
|
|||||||
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
|
|
||||||
|
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
||||||
|
from selenium.webdriver.firefox.service import Service as FirefoxService
|
||||||
|
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
from webdriver_manager.firefox import GeckoDriverManager
|
||||||
|
|
||||||
TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
||||||
|
|
||||||
@@ -28,6 +34,7 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
|||||||
class Twitter_Scraper:
|
class Twitter_Scraper:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
mail,
|
||||||
username,
|
username,
|
||||||
password,
|
password,
|
||||||
max_tweets=50,
|
max_tweets=50,
|
||||||
@@ -39,6 +46,7 @@ class Twitter_Scraper:
|
|||||||
scrape_top=False,
|
scrape_top=False,
|
||||||
):
|
):
|
||||||
print("Initializing Twitter Scraper...")
|
print("Initializing Twitter Scraper...")
|
||||||
|
self.mail = mail
|
||||||
self.username = username
|
self.username = username
|
||||||
self.password = password
|
self.password = password
|
||||||
self.interrupted = False
|
self.interrupted = False
|
||||||
@@ -115,7 +123,8 @@ class Twitter_Scraper:
|
|||||||
print("Setup WebDriver...")
|
print("Setup WebDriver...")
|
||||||
header = Headers().generate()["User-Agent"]
|
header = Headers().generate()["User-Agent"]
|
||||||
|
|
||||||
browser_option = ChromeOptions()
|
# browser_option = ChromeOptions()
|
||||||
|
browser_option = FirefoxOptions()
|
||||||
browser_option.add_argument("--no-sandbox")
|
browser_option.add_argument("--no-sandbox")
|
||||||
browser_option.add_argument("--disable-dev-shm-usage")
|
browser_option.add_argument("--disable-dev-shm-usage")
|
||||||
browser_option.add_argument("--ignore-certificate-errors")
|
browser_option.add_argument("--ignore-certificate-errors")
|
||||||
@@ -129,8 +138,13 @@ class Twitter_Scraper:
|
|||||||
browser_option.add_argument("--headless")
|
browser_option.add_argument("--headless")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print("Initializing ChromeDriver...")
|
# print("Initializing ChromeDriver...")
|
||||||
driver = webdriver.Chrome(
|
# driver = webdriver.Chrome(
|
||||||
|
# options=browser_option,
|
||||||
|
# )
|
||||||
|
|
||||||
|
print("Initializing FirefoxDriver...")
|
||||||
|
driver = webdriver.Firefox(
|
||||||
options=browser_option,
|
options=browser_option,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -138,13 +152,23 @@ class Twitter_Scraper:
|
|||||||
return driver
|
return driver
|
||||||
except WebDriverException:
|
except WebDriverException:
|
||||||
try:
|
try:
|
||||||
print("Downloading ChromeDriver...")
|
# print("Downloading ChromeDriver...")
|
||||||
chromedriver_path = ChromeDriverManager().install()
|
# chromedriver_path = ChromeDriverManager().install()
|
||||||
chrome_service = ChromeService(executable_path=chromedriver_path)
|
# chrome_service = ChromeService(executable_path=chromedriver_path)
|
||||||
|
|
||||||
print("Initializing ChromeDriver...")
|
print("Downloading FirefoxDriver...")
|
||||||
driver = webdriver.Chrome(
|
firefoxdriver_path = GeckoDriverManager().install()
|
||||||
service=chrome_service,
|
firefox_service = FirefoxService(executable_path=firefoxdriver_path)
|
||||||
|
|
||||||
|
# print("Initializing ChromeDriver...")
|
||||||
|
# driver = webdriver.Chrome(
|
||||||
|
# service=chrome_service,
|
||||||
|
# options=browser_option,
|
||||||
|
# )
|
||||||
|
|
||||||
|
print("Initializing FirefoxDriver...")
|
||||||
|
driver = webdriver.Firefox(
|
||||||
|
service=firefox_service,
|
||||||
options=browser_option,
|
options=browser_option,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -206,7 +230,7 @@ class Twitter_Scraper:
|
|||||||
"xpath", "//input[@autocomplete='username']"
|
"xpath", "//input[@autocomplete='username']"
|
||||||
)
|
)
|
||||||
|
|
||||||
username.send_keys(self.username)
|
username.send_keys(self.mail)
|
||||||
username.send_keys(Keys.RETURN)
|
username.send_keys(Keys.RETURN)
|
||||||
sleep(3)
|
sleep(3)
|
||||||
break
|
break
|
||||||
@@ -315,10 +339,12 @@ It may be due to the following:
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
|
url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
|
||||||
|
print(url)
|
||||||
if self.scraper_details["tab"] == "Latest":
|
if self.scraper_details["tab"] == "Latest":
|
||||||
url += "&f=live"
|
url += "&f=live"
|
||||||
|
|
||||||
self.driver.get(url)
|
self.driver.get(url)
|
||||||
|
self.driver.save_screenshot('screenshot5.png')
|
||||||
sleep(3)
|
sleep(3)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -345,6 +371,7 @@ It may be due to the following:
|
|||||||
def scrape_tweets(
|
def scrape_tweets(
|
||||||
self,
|
self,
|
||||||
max_tweets=50,
|
max_tweets=50,
|
||||||
|
no_tweets_limit=False,
|
||||||
scrape_username=None,
|
scrape_username=None,
|
||||||
scrape_hashtag=None,
|
scrape_hashtag=None,
|
||||||
scrape_query=None,
|
scrape_query=None,
|
||||||
@@ -387,11 +414,20 @@ It may be due to the following:
|
|||||||
elif self.scraper_details["type"] == "Home":
|
elif self.scraper_details["type"] == "Home":
|
||||||
print("Scraping Tweets from Home...")
|
print("Scraping Tweets from Home...")
|
||||||
|
|
||||||
self.progress.print_progress(0)
|
# Accept cookies to make the banner disappear
|
||||||
|
try:
|
||||||
|
accept_cookies_btn = self.driver.find_element(
|
||||||
|
"xpath", "//span[text()='Refuse non-essential cookies']/../../..")
|
||||||
|
accept_cookies_btn.click()
|
||||||
|
except NoSuchElementException:
|
||||||
|
pass
|
||||||
|
|
||||||
|
self.progress.print_progress(0, False, 0, no_tweets_limit)
|
||||||
|
|
||||||
refresh_count = 0
|
refresh_count = 0
|
||||||
added_tweets = 0
|
added_tweets = 0
|
||||||
empty_count = 0
|
empty_count = 0
|
||||||
|
retry_cnt = 0
|
||||||
|
|
||||||
while self.scroller.scrolling:
|
while self.scroller.scrolling:
|
||||||
try:
|
try:
|
||||||
@@ -424,9 +460,9 @@ It may be due to the following:
|
|||||||
if not tweet.is_ad:
|
if not tweet.is_ad:
|
||||||
self.data.append(tweet.tweet)
|
self.data.append(tweet.tweet)
|
||||||
added_tweets += 1
|
added_tweets += 1
|
||||||
self.progress.print_progress(len(self.data))
|
self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
|
||||||
|
|
||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets and not no_tweets_limit:
|
||||||
self.scroller.scrolling = False
|
self.scroller.scrolling = False
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@@ -440,10 +476,25 @@ It may be due to the following:
|
|||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets and not no_tweets_limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
if added_tweets == 0:
|
if added_tweets == 0:
|
||||||
|
# Check if there is a button "Retry" and click on it with a regular basis until a certain amount of tries
|
||||||
|
try:
|
||||||
|
while retry_cnt < 15:
|
||||||
|
retry_button = self.driver.find_element(
|
||||||
|
"xpath", "//span[text()='Retry']/../../..")
|
||||||
|
self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
|
||||||
|
sleep(58)
|
||||||
|
retry_button.click()
|
||||||
|
retry_cnt += 1
|
||||||
|
sleep(2)
|
||||||
|
# There is no Retry button so the counter is reseted
|
||||||
|
except NoSuchElementException:
|
||||||
|
retry_cnt = 0
|
||||||
|
self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
|
||||||
|
|
||||||
if empty_count >= 5:
|
if empty_count >= 5:
|
||||||
if refresh_count >= 3:
|
if refresh_count >= 3:
|
||||||
print()
|
print()
|
||||||
@@ -470,11 +521,12 @@ It may be due to the following:
|
|||||||
|
|
||||||
print("")
|
print("")
|
||||||
|
|
||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets or no_tweets_limit:
|
||||||
print("Scraping Complete")
|
print("Scraping Complete")
|
||||||
else:
|
else:
|
||||||
print("Scraping Incomplete")
|
print("Scraping Incomplete")
|
||||||
|
|
||||||
|
if not no_tweets_limit:
|
||||||
print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
|
print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user