feat: scrape user profile, hashtag, search or query or advanced search, latest tweets, or top tweets
This commit is contained in:
93
README.md
93
README.md
@@ -18,6 +18,12 @@ TWITTER_PASSWORD=# Your Twitter Password
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
- Show Help
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scraper --help
|
||||||
|
```
|
||||||
|
|
||||||
- Basic usage
|
- Basic usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -27,15 +33,90 @@ python scraper
|
|||||||
- Set the maximum number of tweets to scrape (defaults to `50`).
|
- Set the maximum number of tweets to scrape (defaults to `50`).
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scraper --tweets=500 # Scrape 500 Tweets
|
python scraper --tweets=500 # Scrape 500 Tweets
|
||||||
```
|
```
|
||||||
|
|
||||||
### Options and Arguments
|
- Options and Arguments
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
usage: python scraper [arg]
|
usage: python scraper [option] ... [arg] ...
|
||||||
|
|
||||||
Arguments Description
|
options: description
|
||||||
--tweets : No. of tweets. default: 50.
|
-t, --tweets : Number of tweets to scrape (default: 50).
|
||||||
e.g. --tweets=500
|
e.g.
|
||||||
|
-t 500
|
||||||
|
--tweets=500
|
||||||
|
|
||||||
|
-u, --username : Twitter username.
|
||||||
|
Scrape tweets from a user's profile.
|
||||||
|
e.g.
|
||||||
|
-u elonmusk
|
||||||
|
--username=@elonmusk
|
||||||
|
|
||||||
|
-ht, --hashtag : Twitter hashtag.
|
||||||
|
Scrape tweets from a hashtag.
|
||||||
|
e.g.
|
||||||
|
-ht javascript
|
||||||
|
--hashtag=javascript
|
||||||
|
|
||||||
|
-q, --query : Twitter query or search.
|
||||||
|
Scrape tweets from a query or search.
|
||||||
|
e.g.
|
||||||
|
-q "Philippine Marites"
|
||||||
|
--query="Jak Roberto anti selos"
|
||||||
|
|
||||||
|
--latest : Twitter latest tweets (default: True).
|
||||||
|
Note: Only for hashtag-based
|
||||||
|
and query-based scraping.
|
||||||
|
usage:
|
||||||
|
python scraper -t 500 -ht=python --latest
|
||||||
|
|
||||||
|
--top : Twitter top tweets (default: False).
|
||||||
|
Note: Only for hashtag-based
|
||||||
|
and query-based scraping.
|
||||||
|
usage:
|
||||||
|
python scraper -t 500 -ht=python --top
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Sample Scraping Commands
|
||||||
|
|
||||||
|
- **Custom Limit Scraping**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scraper -t 500
|
||||||
|
```
|
||||||
|
|
||||||
|
- **User Profile Scraping**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scraper -t 100 -u elonmusk
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Hashtag Scraping**
|
||||||
|
|
||||||
|
- Latest
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scraper -t 100 -ht python --latest
|
||||||
|
```
|
||||||
|
|
||||||
|
- Top
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scraper -t 100 -ht python --top
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Query or Search Scraping**
|
||||||
|
_(Also works with Twitter's advanced search.)_
|
||||||
|
|
||||||
|
- Latest
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scraper -t 100 -q "Jak Roberto Anti Selos" --latest
|
||||||
|
```
|
||||||
|
|
||||||
|
- Top
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scraper -t 100 -q "International News" --top
|
||||||
|
```
|
||||||
|
|||||||
@@ -34,18 +34,85 @@ def main():
|
|||||||
|
|
||||||
print()
|
print()
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Twitter Scraper")
|
parser = argparse.ArgumentParser(
|
||||||
|
add_help=True,
|
||||||
|
usage="python scraper [option] ... [arg] ...",
|
||||||
|
description="Twitter Scraper is a tool that allows you to scrape tweets from twitter without using Twitter's API.",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
"-t",
|
||||||
"--tweets",
|
"--tweets",
|
||||||
type=int,
|
type=int,
|
||||||
default=50,
|
default=50,
|
||||||
help="Number of tweets to scrape (default: 50)",
|
help="Number of tweets to scrape (default: 50)",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-u",
|
||||||
|
"--username",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Twitter username. Scrape tweets from a user's profile.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-ht",
|
||||||
|
"--hashtag",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Twitter hashtag. Scrape tweets from a hashtag.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-q",
|
||||||
|
"--query",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Twitter query or search. Scrape tweets from a query or search.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--latest",
|
||||||
|
action="store_true",
|
||||||
|
help="Scrape latest tweets",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--top",
|
||||||
|
action="store_true",
|
||||||
|
help="Scrape top tweets",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
tweet_type_args = []
|
||||||
|
|
||||||
|
if args.username is not None:
|
||||||
|
tweet_type_args.append(args.username)
|
||||||
|
if args.hashtag is not None:
|
||||||
|
tweet_type_args.append(args.hashtag)
|
||||||
|
if args.query is not None:
|
||||||
|
tweet_type_args.append(args.query)
|
||||||
|
|
||||||
|
if len(tweet_type_args) > 1:
|
||||||
|
print("Please specify only one of --username, --hashtag, or --query.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if args.latest and args.top:
|
||||||
|
print("Please specify either --latest or --top. Not both.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
if USER_UNAME is not None and USER_PASSWORD is not None:
|
if USER_UNAME is not None and USER_PASSWORD is not None:
|
||||||
scraper = Twitter_Scraper(
|
scraper = Twitter_Scraper(
|
||||||
username=USER_UNAME, password=USER_PASSWORD, max_tweets=args.tweets
|
username=USER_UNAME,
|
||||||
|
password=USER_PASSWORD,
|
||||||
|
max_tweets=args.tweets,
|
||||||
|
scrape_username=args.username,
|
||||||
|
scrape_hashtag=args.hashtag,
|
||||||
|
scrape_query=args.query,
|
||||||
|
scrape_latest=args.latest,
|
||||||
|
scrape_top=args.top,
|
||||||
)
|
)
|
||||||
|
|
||||||
scraper.scrape_tweets()
|
scraper.scrape_tweets()
|
||||||
|
|||||||
@@ -25,11 +25,31 @@ TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
|||||||
|
|
||||||
|
|
||||||
class Twitter_Scraper:
|
class Twitter_Scraper:
|
||||||
def __init__(self, username, password, max_tweets=50):
|
def __init__(
|
||||||
|
self,
|
||||||
|
username,
|
||||||
|
password,
|
||||||
|
max_tweets=50,
|
||||||
|
scrape_username=None,
|
||||||
|
scrape_hashtag=None,
|
||||||
|
scrape_query=None,
|
||||||
|
scrape_latest=True,
|
||||||
|
scrape_top=False,
|
||||||
|
):
|
||||||
print("Initializing Twitter Scraper...")
|
print("Initializing Twitter Scraper...")
|
||||||
self.username = username
|
self.username = username
|
||||||
self.password = password
|
self.password = password
|
||||||
self.data = []
|
self.data = []
|
||||||
|
self.scraper_details = {
|
||||||
|
"type": None,
|
||||||
|
"username": scrape_username,
|
||||||
|
"hashtag": str(scrape_hashtag).replace("#", "")
|
||||||
|
if scrape_hashtag is not None
|
||||||
|
else None,
|
||||||
|
"query": scrape_query,
|
||||||
|
"tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
|
||||||
|
}
|
||||||
|
self.router = self.go_to_home
|
||||||
self.tweet_ids = set()
|
self.tweet_ids = set()
|
||||||
self.max_tweets = max_tweets
|
self.max_tweets = max_tweets
|
||||||
self.progress = Progress(0, max_tweets)
|
self.progress = Progress(0, max_tweets)
|
||||||
@@ -38,6 +58,19 @@ class Twitter_Scraper:
|
|||||||
self.scroller = Scroller(self.driver)
|
self.scroller = Scroller(self.driver)
|
||||||
self._login()
|
self._login()
|
||||||
|
|
||||||
|
if scrape_username is not None:
|
||||||
|
self.scraper_details["type"] = "Username"
|
||||||
|
self.router = self.go_to_profile
|
||||||
|
elif scrape_hashtag is not None:
|
||||||
|
self.scraper_details["type"] = "Hashtag"
|
||||||
|
self.router = self.go_to_hashtag
|
||||||
|
elif scrape_query is not None:
|
||||||
|
self.scraper_details["type"] = "Query"
|
||||||
|
self.router = self.go_to_search
|
||||||
|
else:
|
||||||
|
self.scraper_details["type"] = "Home"
|
||||||
|
self.router = self.go_to_home
|
||||||
|
|
||||||
def _get_driver(self):
|
def _get_driver(self):
|
||||||
print("Setup WebDriver...")
|
print("Setup WebDriver...")
|
||||||
header = Headers().generate()["User-Agent"]
|
header = Headers().generate()["User-Agent"]
|
||||||
@@ -177,33 +210,84 @@ It may be due to the following:
|
|||||||
sleep(3)
|
sleep(3)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def go_to_profile(self):
    """Open the profile page of the account being scraped.

    Reads the handle from ``self.scraper_details["username"]``, loads
    ``https://twitter.com/<handle>`` in the web driver and then pauses
    for three seconds.
    """
    handle = self.scraper_details["username"]
    profile_url = f"https://twitter.com/{handle}"
    self.driver.get(profile_url)
    # Fixed pause after navigation (presumably to let the page render).
    sleep(3)
|
||||||
|
|
||||||
|
def go_to_hashtag(self):
    """Open the hashtag page for the configured hashtag.

    Builds ``https://twitter.com/hashtag/<hashtag>?src=hashtag_click`` from
    ``self.scraper_details["hashtag"]``. When the configured tab is
    ``"Latest"`` the ``&f=live`` parameter is appended; any other tab value
    leaves the URL unchanged. The URL is loaded in the web driver, followed
    by a three-second pause.
    """
    details = self.scraper_details
    base_url = f"https://twitter.com/hashtag/{details['hashtag']}?src=hashtag_click"
    live_suffix = "&f=live" if details["tab"] == "Latest" else ""
    self.driver.get(base_url + live_suffix)
    # Fixed pause after navigation (presumably to let the page render).
    sleep(3)
|
||||||
|
|
||||||
|
def go_to_search(self):
    """Open the search results page for the configured query.

    Builds ``https://twitter.com/search?q=<query>&src=typed_query`` from
    ``self.scraper_details["query"]``. When the configured tab is
    ``"Latest"`` the ``&f=live`` parameter is appended; any other tab value
    leaves the URL unchanged. The URL is loaded in the web driver, followed
    by a three-second pause.

    The query is percent-encoded before it is placed in the URL, so search
    terms containing URL-reserved characters (``#``, ``&``, ``?``, ``+``,
    ``%``, spaces, ...) reach the search page intact instead of being
    interpreted as a fragment or as extra query parameters.
    """
    # Imported locally so the method does not depend on a module-level import.
    from urllib.parse import quote

    details = self.scraper_details
    # safe="" also encodes "/" so the whole term stays inside the q= value.
    encoded_query = quote(str(details["query"]), safe="")
    url = f"https://twitter.com/search?q={encoded_query}&src=typed_query"
    if details["tab"] == "Latest":
        url += "&f=live"

    self.driver.get(url)
    # Fixed pause after navigation (presumably to let the page render).
    sleep(3)
|
||||||
|
|
||||||
def get_tweets(self):
|
def get_tweets(self):
|
||||||
self.tweet_cards = self.driver.find_elements(
|
self.tweet_cards = self.driver.find_elements(
|
||||||
"xpath", '//article[@data-testid="tweet"]'
|
"xpath", '//article[@data-testid="tweet"]'
|
||||||
)
|
)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def scrape_tweets(self, callback=None):
|
def scrape_tweets(self, router=None):
|
||||||
if callback is None:
|
if router is None:
|
||||||
callback = self.go_to_home
|
router = self.router
|
||||||
|
|
||||||
callback()
|
router()
|
||||||
|
|
||||||
|
if self.scraper_details["type"] == "Username":
|
||||||
|
print(
|
||||||
|
"Scraping Tweets from @{}...".format(self.scraper_details["username"])
|
||||||
|
)
|
||||||
|
elif self.scraper_details["type"] == "Hashtag":
|
||||||
|
print(
|
||||||
|
"Scraping {} Tweets from #{}...".format(
|
||||||
|
self.scraper_details["tab"], self.scraper_details["hashtag"]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif self.scraper_details["type"] == "Query":
|
||||||
|
print(
|
||||||
|
"Scraping {} Tweets from {} search...".format(
|
||||||
|
self.scraper_details["tab"], self.scraper_details["query"]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif self.scraper_details["type"] == "Home":
|
||||||
|
print("Scraping Tweets from Home...")
|
||||||
|
|
||||||
print("Scraping Tweets...")
|
|
||||||
self.progress.print_progress(0)
|
self.progress.print_progress(0)
|
||||||
|
|
||||||
|
refresh_count = 0
|
||||||
|
added_tweets = 0
|
||||||
|
|
||||||
while self.scroller.scrolling:
|
while self.scroller.scrolling:
|
||||||
try:
|
try:
|
||||||
self.get_tweets()
|
self.get_tweets()
|
||||||
|
added_tweets = 0
|
||||||
|
|
||||||
for card in self.tweet_cards[-15:]:
|
for card in self.tweet_cards[-15:]:
|
||||||
tweet_id = str(card)
|
tweet = Tweet(card)
|
||||||
|
|
||||||
|
try:
|
||||||
|
tweet_id = f"{tweet.user}{tweet.handle}{tweet.date_time}"
|
||||||
|
except Exception as e:
|
||||||
|
continue
|
||||||
|
|
||||||
if tweet_id not in self.tweet_ids:
|
if tweet_id not in self.tweet_ids:
|
||||||
self.tweet_ids.add(tweet_id)
|
self.tweet_ids.add(tweet_id)
|
||||||
tweet = Tweet(card)
|
|
||||||
if tweet:
|
if tweet:
|
||||||
if not tweet.is_ad:
|
if not tweet.is_ad:
|
||||||
self.data.append(tweet.tweet)
|
self.data.append(tweet.tweet)
|
||||||
|
added_tweets += 1
|
||||||
self.progress.print_progress(len(self.data))
|
self.progress.print_progress(len(self.data))
|
||||||
|
|
||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets:
|
||||||
@@ -216,6 +300,15 @@ It may be due to the following:
|
|||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if added_tweets == 0:
|
||||||
|
refresh_count += 1
|
||||||
|
if refresh_count >= 10:
|
||||||
|
print()
|
||||||
|
print("No more tweets to scrape")
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
refresh_count = 0
|
||||||
|
|
||||||
self.scroller.scroll_count = 0
|
self.scroller.scroll_count = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
@@ -227,23 +320,23 @@ It may be due to the following:
|
|||||||
self.scroller.scroll_count += 1
|
self.scroller.scroll_count += 1
|
||||||
|
|
||||||
if self.scroller.scroll_count >= 3:
|
if self.scroller.scroll_count >= 3:
|
||||||
callback()
|
router()
|
||||||
sleep(2)
|
sleep(2)
|
||||||
self.scroller.reset()
|
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
sleep(2)
|
sleep(1)
|
||||||
else:
|
else:
|
||||||
self.scroller.last_position = self.scroller.current_position
|
self.scroller.last_position = self.scroller.current_position
|
||||||
break
|
break
|
||||||
except StaleElementReferenceException:
|
except StaleElementReferenceException:
|
||||||
callback()
|
router()
|
||||||
sleep(2)
|
sleep(2)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
print("\n")
|
||||||
print(f"Error scraping tweets: {e}")
|
print(f"Error scraping tweets: {e}")
|
||||||
break
|
break
|
||||||
|
|
||||||
print("\n")
|
print("")
|
||||||
|
|
||||||
if len(self.data) >= self.max_tweets:
|
if len(self.data) >= self.max_tweets:
|
||||||
print("Scraping Complete")
|
print("Scraping Complete")
|
||||||
|
|||||||
Reference in New Issue
Block a user