In [46]:
import os
import sys
import pandas as pd
from datetime import datetime
from fake_headers import Headers
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService

from webdriver_manager.chrome import ChromeDriverManager

# Timestamp captured when this cell runs.
# NOTE(review): `now` is not referenced by any other visible cell.
now = datetime.now()

# Directory that receives the exported CSV files.
folder_path = './tweets/'

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_path, exist_ok=True)

# Credentials are read from the environment so they never live in the
# notebook; a missing variable raises KeyError here.
USER_UNAME = os.environ['TWITTER_USERNAME']
USER_PASSWORD = os.environ['TWITTER_PASSWORD']
TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"

In [47]:
class Progress:
  """One-line text progress bar that redraws itself on stdout.

  Attributes:
    current: the most recently reported count.
    total: the count that corresponds to 100%.
  """

  def __init__(self, current, total) -> None:
    self.current = current
    self.total = total

  def print_progress(self, current) -> None:
    """Record ``current`` and redraw the bar in place.

    The line starts with a carriage return so successive calls overwrite
    each other instead of scrolling the output.

    Args:
      current: number of items completed so far.
    """
    self.current = current
    bar_length = 40

    # A zero (or otherwise falsy) total would divide by zero; show 0% instead.
    progress = current / self.total if self.total else 0.0

    # Clamp so the bar is always exactly bar_length characters wide, even
    # when current overshoots total or is negative.
    filled = min(bar_length, max(0, int(bar_length * progress)))
    progress_bar = "=" * filled + "-" * (bar_length - filled)

    # The brackets are supplied by the format string only; the bar itself
    # carries none, so they are not printed twice.
    sys.stdout.write(
        "\rProgress: [{}] {:.2%} {} of {}".format(
            progress_bar, progress, current, self.total))
    sys.stdout.flush()


In [48]:
class Scroller():
  """Scroll-state bookkeeping for a single browser driver.

  Attributes:
    driver: object whose ``execute_script`` is used to read the page offset.
    current_position: offset slot written by the caller after each scroll.
    last_position: offset read from the page at construction / last reset.
    scrolling: flag the caller clears to stop its scroll loop.
    scroll_count: number of consecutive scrolls counted by the caller.
  """

  def __init__(self, driver) -> None:
    self.driver = driver
    self.current_position = 0
    self.last_position = self._page_offset()
    self.scrolling = True
    self.scroll_count = 0

  def _page_offset(self):
    """Return whatever the page reports for ``window.pageYOffset``."""
    return self.driver.execute_script("return window.pageYOffset;")

  def reset(self) -> None:
    """Zero the positions/counter and re-read the page offset.

    ``scrolling`` is deliberately left untouched.
    """
    self.scroll_count = 0
    self.current_position = 0
    self.last_position = self._page_offset()

In [49]:
class Twitter_Scraper():
  """Headless Chrome session that logs in to Twitter and collects tweet cards.

  Constructing an instance launches the browser and runs the login flow
  immediately.

  Attributes:
    username / password: credentials typed into the login form.
    data: list the caller fills with scraped tweet records.
    tweet_ids: set the caller uses to remember cards it has already handled.
    max_tweets: number of tweets the caller wants to collect.
    tweet_cards: tweet card elements found by the last ``get_tweets`` call.
    driver: the Selenium Chrome driver.
    scroller: ``Scroller`` bound to ``driver``.
  """

  def __init__(self, username, password, max_tweets=50):
    self.username = username
    self.password = password
    self.data = []
    self.tweet_ids = set()
    self.max_tweets = max_tweets
    self.tweet_cards = []
    self.driver = self._get_driver()
    self.scroller = Scroller(self.driver)
    self._login()

  def _get_driver(self):
    """Build and return a headless Chrome driver with a generated User-Agent."""
    header = Headers().generate()['User-Agent']

    browser_option = ChromeOptions()
    for argument in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--ignore-certificate-errors',
        '--disable-gpu',
        '--log-level=3',
        '--disable-notifications',
        '--disable-popup-blocking',
        '--user-agent={}'.format(header),
        # For hiding the browser window
        '--headless',
    ):
      browser_option.add_argument(argument)

    chromedriver_path = ChromeDriverManager().install()
    chrome_service = ChromeService(executable_path=chromedriver_path)

    return webdriver.Chrome(
      service=chrome_service,
      options=browser_option,
    )

  def _login(self):
    """Open the login page and submit username, optional challenge, password."""
    self.driver.get(TWITTER_LOGIN_URL)
    self.driver.maximize_window()
    sleep(3)

    self._input_username()
    self._input_unusual_activity()
    self._input_password()

  def _submit_text(self, xpath, text):
    """Type ``text`` into the element matched by ``xpath`` and press Return.

    Raises:
      NoSuchElementException: when no element matches ``xpath``.
    """
    field = self.driver.find_element("xpath", xpath)
    field.send_keys(text)
    field.send_keys(Keys.RETURN)
    sleep(3)  # fixed pause before the next login step is attempted

  def _abort(self, message):
    """Print ``message``, quit the browser and stop with ``SystemExit``.

    ``sys.exit`` is used rather than the ``exit()`` site builtin: inside a
    Jupyter kernel ``exit()`` only schedules a kernel shutdown and returns,
    so the login flow would keep running against a driver that has already
    been quit.
    """
    print(message)
    self.driver.quit()
    sys.exit(1)

  def _input_username(self):
    """Submit the username; abort the run if the field is missing."""
    try:
      self._submit_text("//input[@autocomplete='username']", self.username)
    except NoSuchElementException:
      self._abort("Username field not found")

  def _input_unusual_activity(self):
    """Submit the username again if the extra text challenge is shown.

    The challenge input is optional, so its absence is not an error.
    """
    try:
      self._submit_text(
          "//input[@data-testid='ocfEnterTextTextInput']",
          self.username
      )
    except NoSuchElementException:
      pass

  def _input_password(self):
    """Submit the password; abort the run if the field is missing."""
    try:
      self._submit_text(
          "//input[@autocomplete='current-password']",
          self.password
      )
    except NoSuchElementException:
      self._abort("Password field not found")

  def go_to_home(self):
    """Navigate to the home timeline and pause while it loads."""
    self.driver.get("https://twitter.com/home")
    sleep(3)

  def get_tweets(self):
    """Store every tweet card currently in the page in ``tweet_cards``."""
    self.tweet_cards = self.driver.find_elements(
        'xpath',
        '//article[@data-testid="tweet"]'
    )

In [50]:

class Tweet:
  """Fields scraped from one tweet card element.

  Every attribute is given a default before any lookup, so all of them exist
  whichever way parsing ends. An instance is truthy only when the card was
  parsed completely, i.e. when ``tweet`` holds the record tuple.

  Attributes:
    card: the element the fields were read from.
    user: text of the first span under the ``User-Name`` container.
    handle: text of the first span containing ``@``; None if absent.
    date_time: ``datetime`` attribute of the card's ``<time>``; None if absent.
    is_ad: True when the card has no ``<time>`` element.
    verified: True when the verified icon is present.
    content: concatenated text of the tweet body's spans and links.
    reply_cnt / retweet_cnt / like_cnt: counter texts, '0' when the counter
      span is missing.
    tweet: tuple ``(user, handle, date_time, verified, content, reply_cnt,
      retweet_cnt, like_cnt)``, or None when parsing stopped early.

  Raises:
    NoSuchElementException: if the card has no ``User-Name`` span (that
      lookup is not guarded).
  """

  def __init__(self, card) -> None:
    self.card = card

    # Defaults first: the early returns below must not leave attributes
    # (notably is_ad and tweet) undefined for the caller.
    self.handle = None
    self.date_time = None
    self.is_ad = False
    self.verified = False
    self.content = ""
    self.reply_cnt = '0'
    self.retweet_cnt = '0'
    self.like_cnt = '0'
    self.tweet = None

    self.user = card.find_element(
        'xpath',
        './/div[@data-testid="User-Name"]//span'
    ).text

    try:
      self.handle = card.find_element(
        'xpath',
        './/span[contains(text(), "@")]'
      ).text
    except NoSuchElementException:
      # No handle: leave the card unparsed (tweet stays None).
      return

    try:
      self.date_time = card.find_element(
          'xpath',
          './/time'
      ).get_attribute('datetime')
    except NoSuchElementException:
      # A card without a <time> element is classified as an ad.
      self.is_ad = True
      return

    self.verified = self._has_element(
      card,
      './/*[local-name()="svg" and @data-testid="icon-verified"]'
    )

    body_parts = card.find_elements(
        'xpath',
        './/div[@data-testid="tweetText"]/span | .//div[@data-testid="tweetText"]/a'
    )
    self.content = "".join(part.text for part in body_parts)

    self.reply_cnt = self._text_or(
      card, './/div[@data-testid="reply"]//span', '0')
    self.retweet_cnt = self._text_or(
      card, './/div[@data-testid="retweet"]//span', '0')
    self.like_cnt = self._text_or(
      card, './/div[@data-testid="like"]//span', '0')

    self.tweet = (
      self.user,
      self.handle,
      self.date_time,
      self.verified,
      self.content,
      self.reply_cnt,
      self.retweet_cnt,
      self.like_cnt
    )

  def __bool__(self) -> bool:
    """Return True only when the card produced a complete record."""
    return self.tweet is not None

  @staticmethod
  def _has_element(card, xpath) -> bool:
    """Return whether ``xpath`` matches an element inside ``card``."""
    try:
      card.find_element('xpath', xpath)
    except NoSuchElementException:
      return False
    return True

  @staticmethod
  def _text_or(card, xpath, default):
    """Return the text of the first match for ``xpath``, else ``default``."""
    try:
      return card.find_element('xpath', xpath).text
    except NoSuchElementException:
      return default

In [51]:
# Launch the browser and log in (both happen inside the constructor).
scraper = Twitter_Scraper(
  username=USER_UNAME,
  password=USER_PASSWORD,
  max_tweets=50
)

try:
  scraper.go_to_home()
  progress = Progress(0, scraper.max_tweets)
  progress.print_progress(0)

  while scraper.scroller.scrolling:
    scraper.get_tweets()

    # Only the 15 most recently listed cards are inspected on each pass.
    for card in scraper.tweet_cards[-15:]:
      # str(card) is the element's string form, used as a per-element key.
      # NOTE(review): presumably it embeds the WebDriver element id - confirm.
      tweet_id = str(card)
      if tweet_id in scraper.tweet_ids:
        continue
      scraper.tweet_ids.add(tweet_id)

      tweet = Tweet(card)
      # Tweet only builds its `tweet` record when a card parses completely;
      # ads and cards with missing fields have none, so skip them instead
      # of reading attributes that may not exist.
      record = getattr(tweet, "tweet", None)
      if record is None or getattr(tweet, "is_ad", False):
        continue

      scraper.data.append(record)
      progress.print_progress(len(scraper.data))

      if len(scraper.data) >= scraper.max_tweets:
        scraper.scroller.scrolling = False
        break

      if len(scraper.data) % 50 == 0:
        sleep(2)

    if len(scraper.data) >= scraper.max_tweets:
      break

    scraper.scroller.scroll_count = 0

    # Scroll to the bottom until the page offset changes. After three
    # attempts with no movement, reload the home timeline and start over.
    while True:
      scraper.driver.execute_script(
          'window.scrollTo(0, document.body.scrollHeight);'
      )
      sleep(2)
      scraper.scroller.current_position = scraper.driver.execute_script(
          "return window.pageYOffset;"
      )

      if scraper.scroller.last_position == scraper.scroller.current_position:
        scraper.scroller.scroll_count += 1

        if scraper.scroller.scroll_count >= 3:
          scraper.go_to_home()
          sleep(2)
          scraper.scroller.reset()
          break
        else:
          sleep(2)
      else:
        scraper.scroller.last_position = scraper.scroller.current_position
        break

  print()
  print("Scraping Complete")
except StaleElementReferenceException:
  # A card went stale while it was being read; keep what was collected.
  print()
  print("Scraping Incomplete")
finally:
  # quit() ends the whole browser session and its driver process, and the
  # finally clause runs it even when an unexpected exception escapes.
  # close() would only close the current window.
  scraper.driver.quit()

print("Tweets: {} out of {}".format(len(scraper.data), scraper.max_tweets))

Scraping Complete
Tweets: 50 out of 50


In [52]:
# import tabulate

# # Tabulate
# print(tabulate.tabulate(
#     scraper.data[:10],
#     headers=[
#         'Name',
#         'Handle',
#         'Date Time',
#         'Verified',
#         'Content',
#         'Reply Count',
#         'Retweet Count',
#         'Like Count'
#     ],
#     tablefmt='tsv'
# ))
        

In [53]:
# import csv
# 
# with open('twitter_tweets.csv', 'w', encoding='utf-8', newline='') as f:
#     header = ['Name', 'Handle', 'Timestamp', 'Verified',
#               'Content', 'Comments', 'Retweets', 'Likes']
#     writer = csv.writer(f)
#     writer.writerow(header)
#     writer.writerows(scraper.data)

In [54]:
# Column headers, in the same order as the fields of each scraped record.
column_names = (
  'Name',
  'Handle',
  'Timestamp',
  'Verified',
  'Content',
  'Comments',
  'Retweets',
  'Likes',
)

# One list per column: the i-th field of every record goes to the i-th header.
data = {
  column: [record[position] for record in scraper.data]
  for position, column in enumerate(column_names)
}

df = pd.DataFrame(data)

# The file name carries the export time and the number of tweets written.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

file_path = '{}{}_tweets_1-{}.csv'.format(
  folder_path, current_time, len(scraper.data))
df.to_csv(file_path, index=False)
