#!/usr/bin/env python3
"""Analyze 4chan /tv/ posting activity across show "seasons".

Fetches per-day post-count history for a board from the 4stats.io API,
removes the long-term linear trend so seasons from different years are
comparable, and produces season-aligned and full-timeline plots plus a
per-season statistics summary.
"""

import json
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import linregress


class FourChanAnalyzer:
    """Download, detrend, and visualize daily posting activity.

    Typical usage: ``fetch_data()`` -> ``filter_and_normalize()`` ->
    ``print_summary()`` / plotting methods.
    """

    # One fixed color per season so every plot uses the same palette.
    SEASON_COLORS = {
        "Season 1": "blue",
        "Season 2": "orange",
        "Season 3": "green",
        "Season 4": "red",
        "Bloodgames-Bitchtank": "purple",
    }

    def __init__(self):
        # Raw API rows; each row looks like
        # [timestamp_ms, <col2>, posts_per_day, <col4>] — TODO confirm the
        # meaning of columns 2 and 4 against the 4stats.io API.
        self.data = None
        # Processed DataFrame, populated by filter_and_normalize().
        self.df = None
        # Season name -> (start, end) as ISO date strings (inclusive).
        self.seasons = {
            "Season 1": ("2023-04-18", "2023-05-31"),
            "Season 2": ("2023-12-18", "2024-01-28"),
            "Season 3": ("2024-10-27", "2024-12-08"),
            "Season 4": ("2025-06-13", "2025-06-28"),
            "Bloodgames-Bitchtank": ("2024-06-24", "2024-08-02"),
        }

    def _season_ranges(self):
        """Return {season_name: (start_datetime, end_datetime)}.

        Parses the ISO date strings in ``self.seasons`` once, replacing the
        identical dict comprehension previously duplicated in three methods.
        """
        return {
            name: (datetime.strptime(start, "%Y-%m-%d"),
                   datetime.strptime(end, "%Y-%m-%d"))
            for name, (start, end) in self.seasons.items()
        }

    def fetch_data(self, board="tv"):
        """Fetch daily history for *board* from the 4stats.io API.

        On success the raw rows are stored in ``self.data`` and True is
        returned; on any network/HTTP failure an error is printed and
        False is returned.
        """
        print(f"📡 Fetching data from 4stats.io for /{board}/...")

        # Browser-like headers; the API appears to expect an Origin header.
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Origin": "https://4stats.io",
            "DNT": "1",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }

        try:
            response = requests.get(f"https://api.4stats.io/history/day/{board}",
                                    headers=headers, timeout=30)
            response.raise_for_status()
            self.data = response.json()
            print(f"✅ Downloaded {len(self.data)} records.")
            return True
        except requests.exceptions.RequestException as e:
            print(f"❌ Failed to fetch data: {e}")
            return False

    def filter_and_normalize(self, start_year=2023):
        """Filter data from *start_year* onward and detrend posts-per-day.

        Drops rows with missing timestamp or posts-per-day, fits a linear
        trend over time, and subtracts it (re-centering on the overall
        mean) so activity levels are comparable across years.

        Returns the resulting DataFrame (also stored in ``self.df``).
        Raises ValueError if no data has been fetched or nothing survives
        cleaning.
        """
        if not self.data:
            raise ValueError("No data available. Run fetch_data() first.")

        # API timestamps are in milliseconds since the epoch.
        cutoff = datetime(start_year, 1, 1).timestamp() * 1000
        filtered_data = [row for row in self.data if row[0] >= cutoff]
        print(f"✂️ Trimmed to {len(filtered_data)} records from {start_year} onward.")

        # Drop rows missing the timestamp (col 0) or posts-per-day (col 2).
        clean_data = [row for row in filtered_data
                      if row[0] is not None and row[2] is not None]

        if len(clean_data) != len(filtered_data):
            print(f"🧹 Removed {len(filtered_data) - len(clean_data)} rows with missing data.")

        if len(clean_data) == 0:
            raise ValueError("No valid data remaining after cleaning.")

        timestamps = np.array([row[0] for row in clean_data])
        posts_per_day = np.array([row[2] for row in clean_data], dtype=float)

        # Remove the long-term linear trend, keeping the overall mean so
        # the adjusted values stay on the original scale.
        slope, intercept, r_value, p_value, std_err = linregress(timestamps, posts_per_day)
        trendline = slope * timestamps + intercept
        mean_value = posts_per_day.mean()
        adjusted_posts_per_day = posts_per_day - (trendline - mean_value)

        print(f"📉 Detrended data - R²: {r_value**2:.4f}, p-value: {p_value:.4e}")
        # slope is per millisecond; 86_400_000 ms = 1 day.
        print(f"📈 Trend slope: {slope*86400000:.2f} posts/day per day")

        adjusted_data = [
            [row[0], row[1], float(round(adj, 2)), row[3]]
            for row, adj in zip(clean_data, adjusted_posts_per_day)
        ]

        self.df = pd.DataFrame(adjusted_data,
                               columns=["Timestamp", "Column_2", "Posts_Per_Day", "Column_4"])
        self.df["Date"] = pd.to_datetime(self.df["Timestamp"], unit="ms")

        return self.df

    def get_season_stats(self):
        """Return per-season summary statistics.

        Returns a dict keyed by season name with mean/std/max/min of the
        detrended posts-per-day, the number of days with data, and the
        season's start and end dates. Seasons with no data are omitted.
        Raises ValueError if filter_and_normalize() has not been run.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")

        stats = {}
        for season_name, (start_date, end_date) in self._season_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)]

            if len(season_df) > 0:
                stats[season_name] = {
                    "mean": season_df["Posts_Per_Day"].mean(),
                    "std": season_df["Posts_Per_Day"].std(),
                    "max": season_df["Posts_Per_Day"].max(),
                    "min": season_df["Posts_Per_Day"].min(),
                    "days": len(season_df),
                    "start_date": start_date,
                    "end_date": end_date,
                }

        return stats

    def plot_seasons_comparison(self, max_days=42, figsize=(12, 8)):
        """Plot all seasons on a shared day-from-start axis.

        Each season is re-indexed to days 1..*max_days* from its start so
        seasons of different calendar dates overlay directly.
        Returns the matplotlib Figure.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")

        print(f"📊 Creating comparison graph (max {max_days} days)...")

        plt.figure(figsize=figsize)

        for season_name, (start_date, end_date) in self._season_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)].copy()

            if len(season_df) == 0:
                print(f"⚠️ No data found for {season_name}")
                continue

            # Day 1 == season start; clip to the comparison window.
            season_df["Day_Index"] = (season_df["Date"] - start_date).dt.days + 1
            season_df = season_df[season_df["Day_Index"] <= max_days]

            plt.plot(season_df["Day_Index"], season_df["Posts_Per_Day"],
                     label=f"{season_name} (n={len(season_df)})",
                     color=self.SEASON_COLORS[season_name], linewidth=2, alpha=0.8)

        plt.xlabel("Days from Season Start", fontsize=12)
        plt.ylabel("Normalized Posts Per Day", fontsize=12)
        plt.title("4chan /tv/ Activity Across Seasons\n(Detrended and Normalized)", fontsize=14, pad=20)
        plt.legend(loc='upper right')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        return plt.gcf()

    def plot_full_timeline(self, figsize=(15, 8)):
        """Plot the full timeline in gray with each season highlighted in color.

        Returns the matplotlib Figure. Raises ValueError if
        filter_and_normalize() has not been run.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")

        plt.figure(figsize=figsize)

        # Background: the whole detrended series.
        plt.plot(self.df["Date"], self.df["Posts_Per_Day"],
                 color='lightgray', alpha=0.7, linewidth=1)

        # Foreground: re-plot each season's slice in its own color.
        for season_name, (start_date, end_date) in self._season_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)]

            if len(season_df) > 0:
                plt.plot(season_df["Date"], season_df["Posts_Per_Day"],
                         color=self.SEASON_COLORS[season_name], linewidth=3, alpha=0.8,
                         label=season_name)

        plt.xlabel("Date", fontsize=12)
        plt.ylabel("Normalized Posts Per Day", fontsize=12)
        plt.title("4chan /tv/ Activity Timeline with Season Highlights", fontsize=14, pad=20)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()

        return plt.gcf()

    def print_summary(self):
        """Print the per-season statistics from get_season_stats()."""
        stats = self.get_season_stats()

        print("\n📊 SEASON SUMMARY STATISTICS")
        print("=" * 50)

        for season_name, season_stats in stats.items():
            print(f"\n{season_name}:")
            print(f"  Duration: {season_stats['days']} days")
            print(f"  Mean posts/day: {season_stats['mean']:.1f}")
            print(f"  Std deviation: {season_stats['std']:.1f}")
            print(f"  Range: {season_stats['min']:.1f} - {season_stats['max']:.1f}")
            print(f"  Dates: {season_stats['start_date'].strftime('%Y-%m-%d')} to {season_stats['end_date'].strftime('%Y-%m-%d')}")


def main():
    """Fetch, process, summarize, and plot; bail out if the fetch fails."""
    analyzer = FourChanAnalyzer()

    if not analyzer.fetch_data():
        return

    analyzer.filter_and_normalize()
    analyzer.print_summary()

    fig1 = analyzer.plot_seasons_comparison()
    plt.show()

    fig2 = analyzer.plot_full_timeline()
    plt.show()


if __name__ == "__main__":
    main()