# Files: 4chanFishtankDowntrend/enhanced_4chan_analysis(3).py
# Listing metadata: 2025-12-19 23:21:17 -05:00 | 247 lines | 9.5 KiB | Python
#!/usr/bin/env python3
import json
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import linregress
class FourChanAnalyzer:
    """Fetch, detrend, and analyze 4chan /tv/ posting activity across Fishtank seasons.

    Typical workflow:
        analyzer = FourChanAnalyzer()
        analyzer.fetch_data()            # populates self.data from 4stats.io
        analyzer.filter_and_normalize()  # populates self.df (detrended)
        analyzer.print_summary()
        analyzer.plot_seasons_comparison()
        analyzer.plot_full_timeline()
    """

    # One fixed color per season, shared by both plotting methods
    # (previously duplicated inline in each of them).
    SEASON_COLORS = {
        "Season 1": "blue",
        "Season 2": "orange",
        "Season 3": "green",
        "Season 4": "red",
        "Bloodgames-Bitchtank": "purple",
    }

    def __init__(self):
        # Raw API rows; each row looks like [timestamp_ms, col1, posts_per_day, col3].
        self.data = None
        # Processed DataFrame, produced by filter_and_normalize().
        self.df = None
        # Season name -> (start, end) as inclusive ISO date strings.
        self.seasons = {
            "Season 1": ("2023-04-18", "2023-05-31"),
            "Season 2": ("2023-12-18", "2024-01-28"),
            "Season 3": ("2024-10-27", "2024-12-08"),
            "Season 4": ("2025-06-13", "2025-06-28"),
            "Bloodgames-Bitchtank": ("2024-06-24", "2024-08-02"),
        }

    def _season_date_ranges(self):
        """Return ``{season_name: (start_datetime, end_datetime)}`` parsed from self.seasons.

        Centralizes the strptime parsing that was previously repeated verbatim
        in get_season_stats(), plot_seasons_comparison(), and plot_full_timeline().
        """
        return {
            name: (datetime.strptime(start, "%Y-%m-%d"),
                   datetime.strptime(end, "%Y-%m-%d"))
            for name, (start, end) in self.seasons.items()
        }

    def fetch_data(self, board="tv"):
        """Fetch daily history for *board* from the 4stats.io API into self.data.

        Returns True on success, False on any request failure (already logged).
        """
        print(f"📡 Fetching data from 4stats.io for /{board}/...")
        # Browser-like headers copied from a real session; presumably needed so
        # the API does not reject a generic client — TODO confirm which are required.
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Origin": "https://4stats.io",
            "DNT": "1",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }
        try:
            response = requests.get(f"https://api.4stats.io/history/day/{board}",
                                    headers=headers, timeout=30)
            response.raise_for_status()
            self.data = response.json()
            print(f"✅ Downloaded {len(self.data)} records.")
            return True
        except requests.exceptions.RequestException as e:
            print(f"❌ Failed to fetch data: {e}")
            return False

    def filter_and_normalize(self, start_year=2023):
        """Filter rows from *start_year* onward and detrend posts-per-day.

        Drops rows missing a timestamp or posts-per-day value, fits a linear
        trend over time and subtracts it (re-centered on the overall mean so
        values stay in a familiar posts/day scale), then stores the result as
        self.df with columns Timestamp / Column_2 / Posts_Per_Day / Column_4 / Date.

        Raises:
            ValueError: if no data has been fetched, or fewer than two valid
                rows remain (linregress needs at least two points).
        """
        if not self.data:
            raise ValueError("No data available. Run fetch_data() first.")
        # API timestamps are epoch milliseconds, hence the * 1000.
        cutoff = datetime(start_year, 1, 1).timestamp() * 1000
        filtered_data = [row for row in self.data if row[0] >= cutoff]
        print(f"✂️ Trimmed to {len(filtered_data)} records from {start_year} onward.")
        # Drop rows missing the timestamp (col 0) or posts-per-day (col 2).
        clean_data = [row for row in filtered_data
                      if row[0] is not None and row[2] is not None]
        if len(clean_data) != len(filtered_data):
            print(f"🧹 Removed {len(filtered_data) - len(clean_data)} rows with missing data.")
        if len(clean_data) < 2:
            # A single point (or none) cannot support a trend fit.
            raise ValueError("No valid data remaining after cleaning.")
        timestamps = np.array([row[0] for row in clean_data])
        posts_per_day = np.array([row[2] for row in clean_data], dtype=float)
        # Remove the long-term linear trend so seasons are comparable across years.
        slope, intercept, r_value, p_value, std_err = linregress(timestamps, posts_per_day)
        trendline = slope * timestamps + intercept
        mean_value = posts_per_day.mean()
        adjusted_posts_per_day = posts_per_day - (trendline - mean_value)
        print(f"📉 Detrended data - R²: {r_value**2:.4f}, p-value: {p_value:.4e}")
        # slope is per millisecond; 86,400,000 ms/day converts it to posts/day per day.
        print(f"📈 Trend slope: {slope*86400000:.2f} posts/day per day")
        adjusted_data = [
            [row[0], row[1], float(round(adj, 2)), row[3]]
            for row, adj in zip(clean_data, adjusted_posts_per_day)
        ]
        self.df = pd.DataFrame(adjusted_data,
                               columns=["Timestamp", "Column_2", "Posts_Per_Day", "Column_4"])
        self.df["Date"] = pd.to_datetime(self.df["Timestamp"], unit="ms")
        return self.df

    def get_season_stats(self):
        """Return per-season descriptive stats of the detrended posts-per-day.

        Returns a dict keyed by season name (seasons with no rows are omitted);
        each value holds mean/std/max/min, the day count, and the date bounds.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")
        stats = {}
        for season_name, (start_date, end_date) in self._season_date_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)]
            if len(season_df) > 0:
                ppd = season_df["Posts_Per_Day"]
                stats[season_name] = {
                    "mean": ppd.mean(),
                    "std": ppd.std(),
                    "max": ppd.max(),
                    "min": ppd.min(),
                    "days": len(season_df),
                    "start_date": start_date,
                    "end_date": end_date,
                }
        return stats

    def plot_seasons_comparison(self, max_days=42, figsize=(12, 8)):
        """Plot every season's activity aligned to a common "day of season" axis.

        Args:
            max_days: truncate each season to its first *max_days* days.
            figsize: matplotlib figure size.

        Returns the matplotlib Figure.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")
        print(f"📊 Creating comparison graph (max {max_days} days)...")
        plt.figure(figsize=figsize)
        for season_name, (start_date, end_date) in self._season_date_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)].copy()
            if len(season_df) == 0:
                print(f"⚠️ No data found for {season_name}")
                continue
            # Align seasons on a shared x axis: day 1 = season start.
            season_df["Day_Index"] = (season_df["Date"] - start_date).dt.days + 1
            season_df = season_df[season_df["Day_Index"] <= max_days]
            plt.plot(season_df["Day_Index"], season_df["Posts_Per_Day"],
                     label=f"{season_name} (n={len(season_df)})",
                     color=self.SEASON_COLORS[season_name], linewidth=2, alpha=0.8)
        plt.xlabel("Days from Season Start", fontsize=12)
        plt.ylabel("Normalized Posts Per Day", fontsize=12)
        plt.title("4chan /tv/ Activity Across Seasons\n(Detrended and Normalized)", fontsize=14, pad=20)
        plt.legend(loc='upper right')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        return plt.gcf()

    def plot_full_timeline(self, figsize=(15, 8)):
        """Plot the whole detrended timeline in gray with seasons overplotted in color.

        Returns the matplotlib Figure.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")
        plt.figure(figsize=figsize)
        # Full timeline as a muted background trace.
        plt.plot(self.df["Date"], self.df["Posts_Per_Day"],
                 color='lightgray', alpha=0.7, linewidth=1)
        # Re-draw each season's span on top in its own color.
        for season_name, (start_date, end_date) in self._season_date_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)]
            if len(season_df) > 0:
                plt.plot(season_df["Date"], season_df["Posts_Per_Day"],
                         color=self.SEASON_COLORS[season_name], linewidth=3, alpha=0.8,
                         label=season_name)
        plt.xlabel("Date", fontsize=12)
        plt.ylabel("Normalized Posts Per Day", fontsize=12)
        plt.title("4chan /tv/ Activity Timeline with Season Highlights", fontsize=14, pad=20)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        return plt.gcf()

    def print_summary(self):
        """Print a formatted per-season statistics report to stdout."""
        stats = self.get_season_stats()
        print("\n📊 SEASON SUMMARY STATISTICS")
        print("=" * 50)
        for season_name, season_stats in stats.items():
            print(f"\n{season_name}:")
            print(f" Duration: {season_stats['days']} days")
            print(f" Mean posts/day: {season_stats['mean']:.1f}")
            print(f" Std deviation: {season_stats['std']:.1f}")
            print(f" Range: {season_stats['min']:.1f} - {season_stats['max']:.1f}")
            print(f" Dates: {season_stats['start_date'].strftime('%Y-%m-%d')} to {season_stats['end_date'].strftime('%Y-%m-%d')}")
def main():
    """Fetch, process, summarize, and plot 4chan /tv/ activity end to end."""
    analyzer = FourChanAnalyzer()
    # Bail out early if the network fetch failed (fetch_data already logged why).
    if not analyzer.fetch_data():
        return
    analyzer.filter_and_normalize()
    analyzer.print_summary()
    # Show each figure in turn; plt.show() blocks until the window is closed.
    # (Figure return values were previously bound to unused locals.)
    analyzer.plot_seasons_comparison()
    plt.show()
    analyzer.plot_full_timeline()
    plt.show()


if __name__ == "__main__":
    main()