From 7dce277488dcdc66390ac6d2730c7376578b0909 Mon Sep 17 00:00:00 2001 From: Jarrian Date: Sat, 9 Sep 2023 00:34:22 +0800 Subject: [PATCH] file naming for csv --- main.ipynb | 58 +++++++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/main.ipynb b/main.ipynb index c49a149..a769382 100644 --- a/main.ipynb +++ b/main.ipynb @@ -2,14 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 61, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", - "import re\n", "import pandas as pd\n", + "from datetime import datetime\n", "from fake_headers import Headers\n", "from getpass import getpass\n", "from time import sleep\n", @@ -22,6 +22,12 @@ "\n", "from webdriver_manager.chrome import ChromeDriverManager\n", "\n", + "now = datetime.now()\n", + "folder_path = './tweets/'\n", + "\n", + "if not os.path.exists(folder_path):\n", + " os.makedirs(folder_path)\n", + "\n", "USER_UNAME = os.environ['TWITTER_USERNAME']\n", "USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n", "TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"" @@ -29,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -47,14 +53,12 @@ " \"-\" * (bar_length - int(bar_length * progress)) + \"]\"\n", " sys.stdout.write(\n", " \"\\rProgress: [{:<40}] {:.2%} {} of {}\".format(progress_bar, progress, current, self.total))\n", - " sys.stdout.flush()\n", - " if current == self.total:\n", - " print(\"\\n\")\n" + " sys.stdout.flush()\n" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -186,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -279,30 +283,16 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 108, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Progress: [[======================================--]] 95.20% 476 of 500" - ] - }, - { - "ename": "StaleElementReferenceException", - "evalue": "Message: stale element reference: stale element not found\n (Session info: headless chrome=116.0.5845.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception\nStacktrace:\n\tGetHandleVerifier [0x00DC37C3+48947]\n\t(No symbol) [0x00D58551]\n\t(No symbol) [0x00C5C92D]\n\t(No symbol) [0x00C5FD62]\n\t(No symbol) [0x00C6106A]\n\t(No symbol) [0x00C61110]\n\t(No symbol) [0x00C89B21]\n\t(No symbol) [0x00C89EFB]\n\t(No symbol) [0x00C82F61]\n\t(No symbol) [0x00CA50D4]\n\t(No symbol) [0x00C82EB6]\n\t(No symbol) [0x00CA53E4]\n\t(No symbol) [0x00CB75DA]\n\t(No symbol) [0x00CA4E86]\n\t(No symbol) [0x00C816C7]\n\t(No symbol) [0x00C8284D]\n\tGetHandleVerifier [0x0100FDF9+2458985]\n\tGetHandleVerifier [0x0105744F+2751423]\n\tGetHandleVerifier [0x01051361+2726609]\n\tGetHandleVerifier [0x00E40680+560624]\n\t(No symbol) [0x00D6238C]\n\t(No symbol) [0x00D5E268]\n\t(No symbol) [0x00D5E392]\n\t(No symbol) [0x00D510B7]\n\tBaseThreadInitThunk [0x77517D59+25]\n\tRtlInitializeExceptionChain [0x77D6B79B+107]\n\tRtlClearBits [0x77D6B71F+191]\n", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mStaleElementReferenceException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[66], line 18\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[39mif\u001b[39;00m tweet_id \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m scraper\u001b[39m.\u001b[39mtweet_ids:\n\u001b[0;32m 17\u001b[0m scraper\u001b[39m.\u001b[39mtweet_ids\u001b[39m.\u001b[39madd(tweet_id)\n\u001b[1;32m---> 18\u001b[0m tweet \u001b[39m=\u001b[39m Tweet(card)\n\u001b[0;32m 19\u001b[0m \u001b[39mif\u001b[39;00m tweet:\n\u001b[0;32m 20\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m tweet\u001b[39m.\u001b[39mis_ad:\n", - "Cell \u001b[1;32mIn[65], line 5\u001b[0m, in \u001b[0;36mTweet.__init__\u001b[1;34m(self, card)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m, card) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 3\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcard \u001b[39m=\u001b[39m card\n\u001b[1;32m----> 5\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39muser \u001b[39m=\u001b[39m card\u001b[39m.\u001b[39;49mfind_element(\n\u001b[0;32m 6\u001b[0m \u001b[39m'\u001b[39;49m\u001b[39mxpath\u001b[39;49m\u001b[39m'\u001b[39;49m,\n\u001b[0;32m 7\u001b[0m \u001b[39m'\u001b[39;49m\u001b[39m.//div[@data-testid=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mUser-Name\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m]//span\u001b[39;49m\u001b[39m'\u001b[39;49m\n\u001b[0;32m 8\u001b[0m )\u001b[39m.\u001b[39mtext\n\u001b[0;32m 10\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 11\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandle \u001b[39m=\u001b[39m card\u001b[39m.\u001b[39mfind_element(\n\u001b[0;32m 12\u001b[0m \u001b[39m'\u001b[39m\u001b[39mxpath\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[0;32m 13\u001b[0m \u001b[39m'\u001b[39m\u001b[39m.//span[contains(text(), \u001b[39m\u001b[39m\"\u001b[39m\u001b[39m@\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m)]\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 14\u001b[0m )\u001b[39m.\u001b[39mtext\n", - "File \u001b[1;32mc:\\Users\\glori\\anaconda3\\envs\\ml\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:416\u001b[0m, in \u001b[0;36mWebElement.find_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m 413\u001b[0m by \u001b[39m=\u001b[39m By\u001b[39m.\u001b[39mCSS_SELECTOR\n\u001b[0;32m 414\u001b[0m value \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m[name=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mvalue\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m]\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m--> 416\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_execute(Command\u001b[39m.\u001b[39;49mFIND_CHILD_ELEMENT, {\u001b[39m\"\u001b[39;49m\u001b[39musing\u001b[39;49m\u001b[39m\"\u001b[39;49m: by, \u001b[39m\"\u001b[39;49m\u001b[39mvalue\u001b[39;49m\u001b[39m\"\u001b[39;49m: value})[\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m]\n", - "File \u001b[1;32mc:\\Users\\glori\\anaconda3\\envs\\ml\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:394\u001b[0m, in \u001b[0;36mWebElement._execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 392\u001b[0m params \u001b[39m=\u001b[39m {}\n\u001b[0;32m 393\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_id\n\u001b[1;32m--> 394\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parent\u001b[39m.\u001b[39;49mexecute(command, params)\n", - "File \u001b[1;32mc:\\Users\\glori\\anaconda3\\envs\\ml\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 342\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcommand_executor\u001b[39m.\u001b[39mexecute(driver_command, params)\n\u001b[0;32m 343\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[1;32m--> 344\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merror_handler\u001b[39m.\u001b[39;49mcheck_response(response)\n\u001b[0;32m 345\u001b[0m response[\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_unwrap_value(response\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m 346\u001b[0m \u001b[39mreturn\u001b[39;00m response\n", - "File \u001b[1;32mc:\\Users\\glori\\anaconda3\\envs\\ml\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py:229\u001b[0m, in \u001b[0;36mErrorHandler.check_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m 227\u001b[0m alert_text \u001b[39m=\u001b[39m value[\u001b[39m\"\u001b[39m\u001b[39malert\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 228\u001b[0m \u001b[39mraise\u001b[39;00m exception_class(message, screen, stacktrace, alert_text) \u001b[39m# type: ignore[call-arg] # mypy is not smart enough here\u001b[39;00m\n\u001b[1;32m--> 229\u001b[0m \u001b[39mraise\u001b[39;00m exception_class(message, screen, stacktrace)\n", - "\u001b[1;31mStaleElementReferenceException\u001b[0m: Message: stale element reference: stale element not found\n (Session info: headless chrome=116.0.5845.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception\nStacktrace:\n\tGetHandleVerifier [0x00DC37C3+48947]\n\t(No symbol) [0x00D58551]\n\t(No symbol) [0x00C5C92D]\n\t(No symbol) [0x00C5FD62]\n\t(No symbol) [0x00C6106A]\n\t(No symbol) [0x00C61110]\n\t(No symbol) [0x00C89B21]\n\t(No symbol) [0x00C89EFB]\n\t(No symbol) [0x00C82F61]\n\t(No symbol) [0x00CA50D4]\n\t(No symbol) [0x00C82EB6]\n\t(No symbol) [0x00CA53E4]\n\t(No symbol) [0x00CB75DA]\n\t(No symbol) [0x00CA4E86]\n\t(No symbol) [0x00C816C7]\n\t(No symbol) [0x00C8284D]\n\tGetHandleVerifier [0x0100FDF9+2458985]\n\tGetHandleVerifier [0x0105744F+2751423]\n\tGetHandleVerifier [0x01051361+2726609]\n\tGetHandleVerifier [0x00E40680+560624]\n\t(No symbol) [0x00D6238C]\n\t(No symbol) [0x00D5E268]\n\t(No symbol) [0x00D5E392]\n\t(No symbol) [0x00D510B7]\n\tBaseThreadInitThunk [0x77517D59+25]\n\tRtlInitializeExceptionChain [0x77D6B79B+107]\n\tRtlClearBits [0x77D6B71F+191]\n" + "Progress: [[========================================]] 100.00% 10 of 10\n", + "Scraping Complete\n", + "Tweets: 10 out of 10\n" ] } ], @@ -366,8 +356,10 @@ " scraper.scroller.last_position = scraper.scroller.current_position\n", " break\n", "\n", + " print()\n", " print(\"Scraping Complete\")\n", "except StaleElementReferenceException:\n", + " print()\n", " print(\"Scraping Incomplete\")\n", "\n", "scraper.driver.close()\n", @@ -376,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -402,7 +394,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -418,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -434,7 +426,11 @@ "}\n", "\n", "df = pd.DataFrame(data)\n", - "df.to_csv('twitter_tweets.csv', index=False)\n" + "\n", + "current_time = datetime.now().strftime(\"%Y-%m-%d-%H-%M\")\n", + "\n", + "file_path = f'{folder_path}{current_time}_tweets_1-{len(scraper.data)}.csv'\n", + "df.to_csv(file_path, index=False)\n" ] } ],