464 lines
22 KiB
Plaintext
464 lines
22 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 61,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import sys\n",
|
|
"import re\n",
|
|
"import pandas as pd\n",
|
|
"from fake_headers import Headers\n",
|
|
"from getpass import getpass\n",
|
|
"from time import sleep\n",
|
|
"from selenium import webdriver\n",
|
|
"from selenium.webdriver.common.keys import Keys\n",
|
|
"from selenium.common.exceptions import NoSuchElementException, WebDriverException, StaleElementReferenceException\n",
|
|
"\n",
|
|
"from selenium.webdriver.chrome.options import Options as ChromeOptions\n",
|
|
"from selenium.webdriver.chrome.service import Service as ChromeService\n",
|
|
"\n",
|
|
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
|
"\n",
|
|
"USER_UNAME = os.environ['TWITTER_USERNAME']\n",
|
|
"USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n",
|
|
"TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 62,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class Progress:\n",
|
|
" def __init__(self, current, total) -> None:\n",
|
|
" self.current = current\n",
|
|
" self.total = total\n",
|
|
" pass\n",
|
|
" \n",
|
|
" def print_progress(self, current) -> None:\n",
|
|
" self.current = current\n",
|
|
"        progress = current / self.total if self.total else 1.0\n",
|
|
" bar_length = 40\n",
|
|
" progress_bar = \"[\" + \"=\" * int(bar_length * progress) + \\\n",
|
|
" \"-\" * (bar_length - int(bar_length * progress)) + \"]\"\n",
|
|
" sys.stdout.write(\n",
|
|
"            \"\\rProgress: {} {:.2%} {} of {}\".format(progress_bar, progress, current, self.total))\n",
|
|
" sys.stdout.flush()\n",
|
|
" if current == self.total:\n",
|
|
" print(\"\\n\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 63,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class Scroller():\n",
|
|
" def __init__(self, driver) -> None:\n",
|
|
" self.driver = driver\n",
|
|
" self.current_position = 0\n",
|
|
" self.last_position = driver.execute_script(\"return window.pageYOffset;\")\n",
|
|
" self.scrolling = True\n",
|
|
" self.scroll_count = 0\n",
|
|
" pass\n",
|
|
" \n",
|
|
" def reset(self) -> None:\n",
|
|
" self.current_position = 0\n",
|
|
" self.last_position = self.driver.execute_script(\"return window.pageYOffset;\")\n",
|
|
" self.scroll_count = 0\n",
|
|
" pass"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 64,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class Twitter_Scraper():\n",
|
|
" def __init__(self, username, password, max_tweets=50):\n",
|
|
" self.username = username\n",
|
|
" self.password = password\n",
|
|
" self.data = []\n",
|
|
" self.tweet_ids = set()\n",
|
|
" self.max_tweets = max_tweets\n",
|
|
" self.tweet_cards = []\n",
|
|
" self.driver = self._get_driver()\n",
|
|
" self.scroller = Scroller(self.driver)\n",
|
|
" self._login()\n",
|
|
" \n",
|
|
" def _get_driver(self):\n",
|
|
" header = Headers().generate()['User-Agent']\n",
|
|
"\n",
|
|
" browser_option = ChromeOptions()\n",
|
|
" browser_option.add_argument('--no-sandbox')\n",
|
|
" browser_option.add_argument(\"--disable-dev-shm-usage\")\n",
|
|
" browser_option.add_argument('--ignore-certificate-errors')\n",
|
|
" browser_option.add_argument('--disable-gpu')\n",
|
|
" browser_option.add_argument('--log-level=3')\n",
|
|
" browser_option.add_argument('--disable-notifications')\n",
|
|
" browser_option.add_argument('--disable-popup-blocking')\n",
|
|
" browser_option.add_argument('--user-agent={}'.format(header))\n",
|
|
"\n",
|
|
" # For Hiding Browser\n",
|
|
" browser_option.add_argument(\"--headless\")\n",
|
|
"\n",
|
|
" driver = webdriver.Chrome(\n",
|
|
" options=browser_option,\n",
|
|
" )\n",
|
|
" \n",
|
|
" return driver\n",
|
|
" \n",
|
|
" def _login(self):\n",
|
|
" self.driver.get(TWITTER_LOGIN_URL)\n",
|
|
" self.driver.maximize_window()\n",
|
|
" sleep(3)\n",
|
|
" \n",
|
|
" self._input_username()\n",
|
|
" self._input_unusual_activity()\n",
|
|
" self._input_password()\n",
|
|
" pass\n",
|
|
"\n",
|
|
" def _input_username(self):\n",
|
|
" try:\n",
|
|
" username = self.driver.find_element(\n",
|
|
" \"xpath\",\n",
|
|
" \"//input[@autocomplete='username']\"\n",
|
|
" )\n",
|
|
"\n",
|
|
"            username.send_keys(self.username)\n",
|
|
" username.send_keys(Keys.RETURN)\n",
|
|
" sleep(3)\n",
|
|
"\n",
|
|
" except NoSuchElementException:\n",
|
|
" print(\"Username field not found\")\n",
|
|
" self.driver.quit()\n",
|
|
" exit()\n",
|
|
" pass\n",
|
|
"\n",
|
|
" def _input_unusual_activity(self):\n",
|
|
" try:\n",
|
|
" unusual_activity = self.driver.find_element(\n",
|
|
" \"xpath\",\n",
|
|
" \"//input[@data-testid='ocfEnterTextTextInput']\"\n",
|
|
" )\n",
|
|
"            unusual_activity.send_keys(self.username)\n",
|
|
" unusual_activity.send_keys(Keys.RETURN)\n",
|
|
" sleep(3)\n",
|
|
" except NoSuchElementException:\n",
|
|
" pass\n",
|
|
" pass\n",
|
|
"\n",
|
|
" def _input_password(self):\n",
|
|
" try:\n",
|
|
" password = self.driver.find_element(\n",
|
|
" \"xpath\",\n",
|
|
" \"//input[@autocomplete='current-password']\"\n",
|
|
" )\n",
|
|
"\n",
|
|
"            password.send_keys(self.password)\n",
|
|
" password.send_keys(Keys.RETURN)\n",
|
|
" sleep(3)\n",
|
|
"\n",
|
|
" except NoSuchElementException:\n",
|
|
" print(\"Password field not found\")\n",
|
|
" self.driver.quit()\n",
|
|
" exit()\n",
|
|
" pass\n",
|
|
" \n",
|
|
" def go_to_home(self):\n",
|
|
" self.driver.get(\"https://twitter.com/home\")\n",
|
|
" sleep(3)\n",
|
|
" pass\n",
|
|
" \n",
|
|
" def get_tweets(self):\n",
|
|
" self.tweet_cards = self.driver.find_elements(\n",
|
|
" 'xpath',\n",
|
|
" '//article[@data-testid=\"tweet\"]'\n",
|
|
" )\n",
|
|
" pass"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 65,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"class Tweet: \n",
|
|
" def __init__(self, card) -> None:\n",
|
|
" self.card = card\n",
|
|
"        self.is_ad = True  # skipped by default; cleared only once a timestamp is found\n",
|
|
" self.user = card.find_element(\n",
|
|
" 'xpath',\n",
|
|
" './/div[@data-testid=\"User-Name\"]//span'\n",
|
|
" ).text\n",
|
|
" \n",
|
|
" try:\n",
|
|
" self.handle = card.find_element(\n",
|
|
" 'xpath',\n",
|
|
" './/span[contains(text(), \"@\")]'\n",
|
|
" ).text\n",
|
|
" except NoSuchElementException:\n",
|
|
" return\n",
|
|
" \n",
|
|
" try:\n",
|
|
" self.date_time = card.find_element(\n",
|
|
" 'xpath',\n",
|
|
" './/time'\n",
|
|
" ).get_attribute('datetime')\n",
|
|
" \n",
|
|
" if self.date_time is not None:\n",
|
|
" self.is_ad = False\n",
|
|
" except NoSuchElementException:\n",
|
|
" self.is_ad = True\n",
|
|
" return\n",
|
|
" \n",
|
|
" try:\n",
|
|
" card.find_element(\n",
|
|
" 'xpath',\n",
|
|
" './/*[local-name()=\"svg\" and @data-testid=\"icon-verified\"]'\n",
|
|
" )\n",
|
|
" \n",
|
|
" self.verified = True\n",
|
|
" except NoSuchElementException:\n",
|
|
" self.verified = False\n",
|
|
" \n",
|
|
" self.content = \"\"\n",
|
|
" contents = card.find_elements(\n",
|
|
" 'xpath',\n",
|
|
" './/div[@data-testid=\"tweetText\"]/span | .//div[@data-testid=\"tweetText\"]/a'\n",
|
|
" )\n",
|
|
"\n",
|
|
" for index, content in enumerate(contents):\n",
|
|
" self.content += content.text\n",
|
|
" \n",
|
|
" try:\n",
|
|
" self.reply_cnt= card.find_element(\n",
|
|
" 'xpath',\n",
|
|
" './/div[@data-testid=\"reply\"]//span'\n",
|
|
" ).text\n",
|
|
" except NoSuchElementException:\n",
|
|
" self.reply_cnt = '0'\n",
|
|
" \n",
|
|
" try:\n",
|
|
" self.retweet_cnt = card.find_element(\n",
|
|
" 'xpath',\n",
|
|
" './/div[@data-testid=\"retweet\"]//span'\n",
|
|
" ).text\n",
|
|
" except NoSuchElementException:\n",
|
|
" self.retweet_cnt = '0'\n",
|
|
" \n",
|
|
" try:\n",
|
|
" self.like_cnt = card.find_element(\n",
|
|
" 'xpath',\n",
|
|
" './/div[@data-testid=\"like\"]//span'\n",
|
|
" ).text\n",
|
|
" except NoSuchElementException:\n",
|
|
" self.like_cnt = '0'\n",
|
|
" \n",
|
|
" self.tweet = (\n",
|
|
" self.user,\n",
|
|
" self.handle,\n",
|
|
" self.date_time,\n",
|
|
" self.verified,\n",
|
|
" self.content,\n",
|
|
" self.reply_cnt,\n",
|
|
" self.retweet_cnt,\n",
|
|
" self.like_cnt\n",
|
|
" )\n",
|
|
" \n",
|
|
" pass"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 66,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Progress: [[======================================--]] 95.20% 476 of 500"
|
|
]
|
|
},
|
|
{
|
|
"ename": "StaleElementReferenceException",
|
|
"evalue": "Message: stale element reference: stale element not found\n (Session info: headless chrome=116.0.5845.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception\nStacktrace:\n\tGetHandleVerifier [0x00DC37C3+48947]\n\t(No symbol) [0x00D58551]\n\t(No symbol) [0x00C5C92D]\n\t(No symbol) [0x00C5FD62]\n\t(No symbol) [0x00C6106A]\n\t(No symbol) [0x00C61110]\n\t(No symbol) [0x00C89B21]\n\t(No symbol) [0x00C89EFB]\n\t(No symbol) [0x00C82F61]\n\t(No symbol) [0x00CA50D4]\n\t(No symbol) [0x00C82EB6]\n\t(No symbol) [0x00CA53E4]\n\t(No symbol) [0x00CB75DA]\n\t(No symbol) [0x00CA4E86]\n\t(No symbol) [0x00C816C7]\n\t(No symbol) [0x00C8284D]\n\tGetHandleVerifier [0x0100FDF9+2458985]\n\tGetHandleVerifier [0x0105744F+2751423]\n\tGetHandleVerifier [0x01051361+2726609]\n\tGetHandleVerifier [0x00E40680+560624]\n\t(No symbol) [0x00D6238C]\n\t(No symbol) [0x00D5E268]\n\t(No symbol) [0x00D5E392]\n\t(No symbol) [0x00D510B7]\n\tBaseThreadInitThunk [0x77517D59+25]\n\tRtlInitializeExceptionChain [0x77D6B79B+107]\n\tRtlClearBits [0x77D6B71F+191]\n",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[1;31mStaleElementReferenceException\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[1;32mIn[66], line 18\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[39mif\u001b[39;00m tweet_id \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m scraper\u001b[39m.\u001b[39mtweet_ids:\n\u001b[0;32m 17\u001b[0m scraper\u001b[39m.\u001b[39mtweet_ids\u001b[39m.\u001b[39madd(tweet_id)\n\u001b[1;32m---> 18\u001b[0m tweet \u001b[39m=\u001b[39m Tweet(card)\n\u001b[0;32m 19\u001b[0m \u001b[39mif\u001b[39;00m tweet:\n\u001b[0;32m 20\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m tweet\u001b[39m.\u001b[39mis_ad:\n",
|
|
"Cell \u001b[1;32mIn[65], line 5\u001b[0m, in \u001b[0;36mTweet.__init__\u001b[1;34m(self, card)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m, card) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 3\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcard \u001b[39m=\u001b[39m card\n\u001b[1;32m----> 5\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39muser \u001b[39m=\u001b[39m card\u001b[39m.\u001b[39;49mfind_element(\n\u001b[0;32m 6\u001b[0m \u001b[39m'\u001b[39;49m\u001b[39mxpath\u001b[39;49m\u001b[39m'\u001b[39;49m,\n\u001b[0;32m 7\u001b[0m \u001b[39m'\u001b[39;49m\u001b[39m.//div[@data-testid=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mUser-Name\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m]//span\u001b[39;49m\u001b[39m'\u001b[39;49m\n\u001b[0;32m 8\u001b[0m )\u001b[39m.\u001b[39mtext\n\u001b[0;32m 10\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 11\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandle \u001b[39m=\u001b[39m card\u001b[39m.\u001b[39mfind_element(\n\u001b[0;32m 12\u001b[0m \u001b[39m'\u001b[39m\u001b[39mxpath\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[0;32m 13\u001b[0m \u001b[39m'\u001b[39m\u001b[39m.//span[contains(text(), \u001b[39m\u001b[39m\"\u001b[39m\u001b[39m@\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m)]\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 14\u001b[0m )\u001b[39m.\u001b[39mtext\n",
|
|
"File \u001b[1;32mc:\\Users\\glori\\anaconda3\\envs\\ml\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:416\u001b[0m, in \u001b[0;36mWebElement.find_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m 413\u001b[0m by \u001b[39m=\u001b[39m By\u001b[39m.\u001b[39mCSS_SELECTOR\n\u001b[0;32m 414\u001b[0m value \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m[name=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mvalue\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m]\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m--> 416\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_execute(Command\u001b[39m.\u001b[39;49mFIND_CHILD_ELEMENT, {\u001b[39m\"\u001b[39;49m\u001b[39musing\u001b[39;49m\u001b[39m\"\u001b[39;49m: by, \u001b[39m\"\u001b[39;49m\u001b[39mvalue\u001b[39;49m\u001b[39m\"\u001b[39;49m: value})[\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m]\n",
|
|
"File \u001b[1;32mc:\\Users\\glori\\anaconda3\\envs\\ml\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:394\u001b[0m, in \u001b[0;36mWebElement._execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 392\u001b[0m params \u001b[39m=\u001b[39m {}\n\u001b[0;32m 393\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_id\n\u001b[1;32m--> 394\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parent\u001b[39m.\u001b[39;49mexecute(command, params)\n",
|
|
"File \u001b[1;32mc:\\Users\\glori\\anaconda3\\envs\\ml\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 342\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcommand_executor\u001b[39m.\u001b[39mexecute(driver_command, params)\n\u001b[0;32m 343\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[1;32m--> 344\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merror_handler\u001b[39m.\u001b[39;49mcheck_response(response)\n\u001b[0;32m 345\u001b[0m response[\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_unwrap_value(response\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m 346\u001b[0m \u001b[39mreturn\u001b[39;00m response\n",
|
|
"File \u001b[1;32mc:\\Users\\glori\\anaconda3\\envs\\ml\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py:229\u001b[0m, in \u001b[0;36mErrorHandler.check_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m 227\u001b[0m alert_text \u001b[39m=\u001b[39m value[\u001b[39m\"\u001b[39m\u001b[39malert\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 228\u001b[0m \u001b[39mraise\u001b[39;00m exception_class(message, screen, stacktrace, alert_text) \u001b[39m# type: ignore[call-arg] # mypy is not smart enough here\u001b[39;00m\n\u001b[1;32m--> 229\u001b[0m \u001b[39mraise\u001b[39;00m exception_class(message, screen, stacktrace)\n",
|
|
"\u001b[1;31mStaleElementReferenceException\u001b[0m: Message: stale element reference: stale element not found\n (Session info: headless chrome=116.0.5845.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception\nStacktrace:\n\tGetHandleVerifier [0x00DC37C3+48947]\n\t(No symbol) [0x00D58551]\n\t(No symbol) [0x00C5C92D]\n\t(No symbol) [0x00C5FD62]\n\t(No symbol) [0x00C6106A]\n\t(No symbol) [0x00C61110]\n\t(No symbol) [0x00C89B21]\n\t(No symbol) [0x00C89EFB]\n\t(No symbol) [0x00C82F61]\n\t(No symbol) [0x00CA50D4]\n\t(No symbol) [0x00C82EB6]\n\t(No symbol) [0x00CA53E4]\n\t(No symbol) [0x00CB75DA]\n\t(No symbol) [0x00CA4E86]\n\t(No symbol) [0x00C816C7]\n\t(No symbol) [0x00C8284D]\n\tGetHandleVerifier [0x0100FDF9+2458985]\n\tGetHandleVerifier [0x0105744F+2751423]\n\tGetHandleVerifier [0x01051361+2726609]\n\tGetHandleVerifier [0x00E40680+560624]\n\t(No symbol) [0x00D6238C]\n\t(No symbol) [0x00D5E268]\n\t(No symbol) [0x00D5E392]\n\t(No symbol) [0x00D510B7]\n\tBaseThreadInitThunk [0x77517D59+25]\n\tRtlInitializeExceptionChain [0x77D6B79B+107]\n\tRtlClearBits [0x77D6B71F+191]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"scraper = Twitter_Scraper(\n",
|
|
" username=USER_UNAME,\n",
|
|
" password=USER_PASSWORD,\n",
|
|
" max_tweets=500\n",
|
|
")\n",
|
|
"\n",
|
|
"scraper.go_to_home()\n",
|
|
"progress = Progress(0, scraper.max_tweets)\n",
|
|
"progress.print_progress(0)\n",
|
|
"\n",
|
|
"try:\n",
|
|
" while scraper.scroller.scrolling:\n",
|
|
" scraper.get_tweets()\n",
|
|
"\n",
|
|
" for card in scraper.tweet_cards[-15:]:\n",
|
|
" tweet_id = str(card)\n",
|
|
" if tweet_id not in scraper.tweet_ids:\n",
|
|
" scraper.tweet_ids.add(tweet_id)\n",
|
|
" tweet = Tweet(card)\n",
|
|
" if tweet:\n",
|
|
" if not tweet.is_ad:\n",
|
|
" scraper.data.append(tweet.tweet)\n",
|
|
" progress.print_progress(len(scraper.data))\n",
|
|
"\n",
|
|
" if len(scraper.data) >= scraper.max_tweets:\n",
|
|
" scraper.scroller.scrolling = False\n",
|
|
" break\n",
|
|
"\n",
|
|
" if len(scraper.data) % 50 == 0:\n",
|
|
" sleep(2)\n",
|
|
"\n",
|
|
" if len(scraper.data) >= scraper.max_tweets:\n",
|
|
" break\n",
|
|
"\n",
|
|
" scraper.scroller.scroll_count = 0\n",
|
|
"\n",
|
|
" while True:\n",
|
|
" scraper.driver.execute_script(\n",
|
|
" 'window.scrollTo(0, document.body.scrollHeight);'\n",
|
|
" )\n",
|
|
" sleep(2)\n",
|
|
" scraper.scroller.current_position = scraper.driver.execute_script(\n",
|
|
" \"return window.pageYOffset;\"\n",
|
|
" )\n",
|
|
"\n",
|
|
" if scraper.scroller.last_position == scraper.scroller.current_position:\n",
|
|
" scraper.scroller.scroll_count += 1\n",
|
|
"\n",
|
|
" if scraper.scroller.scroll_count >= 3:\n",
|
|
" scraper.go_to_home()\n",
|
|
" sleep(2)\n",
|
|
" scraper.scroller.reset()\n",
|
|
" break\n",
|
|
" else:\n",
|
|
" sleep(2)\n",
|
|
" else:\n",
|
|
" scraper.scroller.last_position = scraper.scroller.current_position\n",
|
|
" break\n",
|
|
"\n",
|
|
" print(\"Scraping Complete\")\n",
|
|
"except StaleElementReferenceException:\n",
|
|
" print(\"Scraping Incomplete\")\n",
|
|
"\n",
|
|
"scraper.driver.quit()\n",
|
|
"print(\"Tweets: {} out of {}\".format(len(scraper.data), scraper.max_tweets))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import tabulate\n",
|
|
"\n",
|
|
"# # Tabulate\n",
|
|
"# print(tabulate.tabulate(\n",
|
|
"# scraper.data[:10],\n",
|
|
"# headers=[\n",
|
|
"# 'Name',\n",
|
|
"# 'Handle',\n",
|
|
"# 'Date Time',\n",
|
|
"# 'Verified',\n",
|
|
"# 'Content',\n",
|
|
"# 'Reply Count',\n",
|
|
"# 'Retweet Count',\n",
|
|
"# 'Like Count'\n",
|
|
"# ],\n",
|
|
"# tablefmt='tsv'\n",
|
|
"# ))\n",
|
|
" "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import csv\n",
|
|
"# \n",
|
|
"# with open('twitter_tweets.csv', 'w', encoding='utf-8', newline='') as f:\n",
|
|
"# header = ['Name', 'Handle', 'Timestamp', 'Verified',\n",
|
|
"# 'Content', 'Comments', 'Retweets', 'Likes']\n",
|
|
"# writer = csv.writer(f)\n",
|
|
"# writer.writerow(header)\n",
|
|
"# writer.writerows(scraper.data)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data = {\n",
|
|
" 'Name': [tweet[0] for tweet in scraper.data],\n",
|
|
" 'Handle': [tweet[1] for tweet in scraper.data],\n",
|
|
" 'Timestamp': [tweet[2] for tweet in scraper.data],\n",
|
|
" 'Verified': [tweet[3] for tweet in scraper.data],\n",
|
|
" 'Content': [tweet[4] for tweet in scraper.data],\n",
|
|
" 'Comments': [tweet[5] for tweet in scraper.data],\n",
|
|
" 'Retweets': [tweet[6] for tweet in scraper.data],\n",
|
|
" 'Likes': [tweet[7] for tweet in scraper.data]\n",
|
|
"}\n",
|
|
"\n",
|
|
"df = pd.DataFrame(data)\n",
|
|
"df.to_csv('twitter_tweets.csv', index=False)\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "ml",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.12"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|