feat: scrape user profile, hashtag, search or query or advanced search, latest tweets, or top tweets

This commit is contained in:
Jarrian
2023-09-09 21:10:44 +08:00
parent 07ece4e29a
commit ed0df7a0a2
3 changed files with 262 additions and 21 deletions

View File

@@ -18,6 +18,12 @@ TWITTER_PASSWORD=# Your Twitter Password
## Usage
- Show Help
```bash
python scraper --help
```
- Basic usage
```bash
@@ -27,15 +33,90 @@ python scraper
- Setting the maximum number of tweets to scrape. Defaults to `50`.
```bash
python scraper --tweets=500 # Scrape 500 Tweets
python scraper --tweets=500 # Scrape 500 Tweets
```
### Options and Arguments
- Options and Arguments
```bash
usage: python scraper [arg]
usage: python scraper [option] ... [arg] ...
Arguments Description
--tweets : No. of tweets. default: 50.
e.g. --tweets=500
options: description
-t, --tweets : Number of tweets to scrape (default: 50).
e.g.
-t 500
--tweets=500
-u, --username : Twitter username.
Scrape tweets from a user's profile.
e.g.
-u elonmusk
--username=@elonmusk
-ht, --hashtag : Twitter hashtag.
Scrape tweets from a hashtag.
e.g.
-ht javascript
--hashtag=javascript
-q, --query : Twitter query or search.
Scrape tweets from a query or search.
e.g.
-q "Philippine Marites"
--query="Jak Roberto anti selos"
--latest : Twitter latest tweets (default: True).
Note: Only for hashtag-based
and query-based scraping.
usage:
python scraper -t 500 -ht python --latest
--top : Twitter top tweets (default: False).
Note: Only for hashtag-based
and query-based scraping.
usage:
python scraper -t 500 -ht python --top
```
### Sample Scraping Commands
- **Custom Limit Scraping**
```bash
python scraper -t 500
```
- **User Profile Scraping**
```bash
python scraper -t 100 -u elonmusk
```
- **Hashtag Scraping**
- Latest
```bash
python scraper -t 100 -ht python --latest
```
- Top
```bash
python scraper -t 100 -ht python --top
```
- **Query or Search Scraping**
_(Also works with Twitter advanced search.)_
- Latest
```bash
python scraper -t 100 -q "Jak Roberto Anti Selos" --latest
```
- Top
```bash
python scraper -t 100 -q "International News" --top
```

View File

@@ -34,18 +34,85 @@ def main():
print()
parser = argparse.ArgumentParser(description="Twitter Scraper")
parser = argparse.ArgumentParser(
add_help=True,
usage="python scraper [option] ... [arg] ...",
description="Twitter Scraper is a tool that allows you to scrape tweets from twitter without using Twitter's API.",
)
parser.add_argument(
"-t",
"--tweets",
type=int,
default=50,
help="Number of tweets to scrape (default: 50)",
)
parser.add_argument(
"-u",
"--username",
type=str,
default=None,
help="Twitter username. Scrape tweets from a user's profile.",
)
parser.add_argument(
"-ht",
"--hashtag",
type=str,
default=None,
help="Twitter hashtag. Scrape tweets from a hashtag.",
)
parser.add_argument(
"-q",
"--query",
type=str,
default=None,
help="Twitter query or search. Scrape tweets from a query or search.",
)
parser.add_argument(
"--latest",
action="store_true",
help="Scrape latest tweets",
)
parser.add_argument(
"--top",
action="store_true",
help="Scrape top tweets",
)
args = parser.parse_args()
tweet_type_args = []
if args.username is not None:
tweet_type_args.append(args.username)
if args.hashtag is not None:
tweet_type_args.append(args.hashtag)
if args.query is not None:
tweet_type_args.append(args.query)
if len(tweet_type_args) > 1:
print("Please specify only one of --username, --hashtag, or --query.")
sys.exit(1)
if args.latest and args.top:
print("Please specify either --latest or --top. Not both.")
sys.exit(1)
if USER_UNAME is not None and USER_PASSWORD is not None:
scraper = Twitter_Scraper(
username=USER_UNAME, password=USER_PASSWORD, max_tweets=args.tweets
username=USER_UNAME,
password=USER_PASSWORD,
max_tweets=args.tweets,
scrape_username=args.username,
scrape_hashtag=args.hashtag,
scrape_query=args.query,
scrape_latest=args.latest,
scrape_top=args.top,
)
scraper.scrape_tweets()

View File

@@ -25,11 +25,31 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
class Twitter_Scraper:
def __init__(self, username, password, max_tweets=50):
def __init__(
self,
username,
password,
max_tweets=50,
scrape_username=None,
scrape_hashtag=None,
scrape_query=None,
scrape_latest=True,
scrape_top=False,
):
print("Initializing Twitter Scraper...")
self.username = username
self.password = password
self.data = []
self.scraper_details = {
"type": None,
"username": scrape_username,
"hashtag": str(scrape_hashtag).replace("#", "")
if scrape_hashtag is not None
else None,
"query": scrape_query,
"tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
}
self.router = self.go_to_home
self.tweet_ids = set()
self.max_tweets = max_tweets
self.progress = Progress(0, max_tweets)
@@ -38,6 +58,19 @@ class Twitter_Scraper:
self.scroller = Scroller(self.driver)
self._login()
if scrape_username is not None:
self.scraper_details["type"] = "Username"
self.router = self.go_to_profile
elif scrape_hashtag is not None:
self.scraper_details["type"] = "Hashtag"
self.router = self.go_to_hashtag
elif scrape_query is not None:
self.scraper_details["type"] = "Query"
self.router = self.go_to_search
else:
self.scraper_details["type"] = "Home"
self.router = self.go_to_home
def _get_driver(self):
print("Setup WebDriver...")
header = Headers().generate()["User-Agent"]
@@ -177,33 +210,84 @@ It may be due to the following:
sleep(3)
pass
def go_to_profile(self):
self.driver.get(f"https://twitter.com/{self.scraper_details['username']}")
sleep(3)
pass
def go_to_hashtag(self):
url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click"
if self.scraper_details["tab"] == "Latest":
url += "&f=live"
self.driver.get(url)
sleep(3)
pass
def go_to_search(self):
url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
if self.scraper_details["tab"] == "Latest":
url += "&f=live"
self.driver.get(url)
sleep(3)
pass
def get_tweets(self):
self.tweet_cards = self.driver.find_elements(
"xpath", '//article[@data-testid="tweet"]'
)
pass
def scrape_tweets(self, callback=None):
if callback is None:
callback = self.go_to_home
def scrape_tweets(self, router=None):
if router is None:
router = self.router
callback()
router()
if self.scraper_details["type"] == "Username":
print(
"Scraping Tweets from @{}...".format(self.scraper_details["username"])
)
elif self.scraper_details["type"] == "Hashtag":
print(
"Scraping {} Tweets from #{}...".format(
self.scraper_details["tab"], self.scraper_details["hashtag"]
)
)
elif self.scraper_details["type"] == "Query":
print(
"Scraping {} Tweets from {} search...".format(
self.scraper_details["tab"], self.scraper_details["query"]
)
)
elif self.scraper_details["type"] == "Home":
print("Scraping Tweets from Home...")
print("Scraping Tweets...")
self.progress.print_progress(0)
refresh_count = 0
added_tweets = 0
while self.scroller.scrolling:
try:
self.get_tweets()
added_tweets = 0
for card in self.tweet_cards[-15:]:
tweet_id = str(card)
tweet = Tweet(card)
try:
tweet_id = f"{tweet.user}{tweet.handle}{tweet.date_time}"
except Exception as e:
continue
if tweet_id not in self.tweet_ids:
self.tweet_ids.add(tweet_id)
tweet = Tweet(card)
if tweet:
if not tweet.is_ad:
self.data.append(tweet.tweet)
added_tweets += 1
self.progress.print_progress(len(self.data))
if len(self.data) >= self.max_tweets:
@@ -216,6 +300,15 @@ It may be due to the following:
if len(self.data) >= self.max_tweets:
break
if added_tweets == 0:
refresh_count += 1
if refresh_count >= 10:
print()
print("No more tweets to scrape")
break
else:
refresh_count = 0
self.scroller.scroll_count = 0
while True:
@@ -227,23 +320,23 @@ It may be due to the following:
self.scroller.scroll_count += 1
if self.scroller.scroll_count >= 3:
callback()
router()
sleep(2)
self.scroller.reset()
break
else:
sleep(2)
sleep(1)
else:
self.scroller.last_position = self.scroller.current_position
break
except StaleElementReferenceException:
callback()
router()
sleep(2)
except Exception as e:
print("\n")
print(f"Error scraping tweets: {e}")
break
print("\n")
print("")
if len(self.data) >= self.max_tweets:
print("Scraping Complete")