# Files: 4chanFishtankDowntrend/enhanced_4chan_analysis(3).py
# Listing metadata: 2025-12-19 23:21:17 -05:00 | 247 lines | 9.5 KiB | Python
#!/usr/bin/env python3
import json
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import linregress
class FourChanAnalyzer:
    """Fetch, detrend, and analyze 4chan /tv/ posting activity across Fishtank seasons.

    Typical workflow:
        analyzer = FourChanAnalyzer()
        analyzer.fetch_data()            # populates self.data from 4stats.io
        analyzer.filter_and_normalize()  # populates self.df (detrended)
        analyzer.print_summary()
        analyzer.plot_seasons_comparison()
        analyzer.plot_full_timeline()
    """

    # One fixed color per season, shared by both plotting methods
    # (previously duplicated inline in each of them).
    SEASON_COLORS = {
        "Season 1": "blue",
        "Season 2": "orange",
        "Season 3": "green",
        "Season 4": "red",
        "Bloodgames-Bitchtank": "purple",
    }

    def __init__(self):
        # Raw API rows; each row looks like [timestamp_ms, col1, posts_per_day, col3].
        self.data = None
        # Processed DataFrame, produced by filter_and_normalize().
        self.df = None
        # Season name -> (start, end) as inclusive ISO date strings.
        self.seasons = {
            "Season 1": ("2023-04-18", "2023-05-31"),
            "Season 2": ("2023-12-18", "2024-01-28"),
            "Season 3": ("2024-10-27", "2024-12-08"),
            "Season 4": ("2025-06-13", "2025-06-28"),
            "Bloodgames-Bitchtank": ("2024-06-24", "2024-08-02"),
        }

    def _season_date_ranges(self):
        """Return ``{season_name: (start_datetime, end_datetime)}`` parsed from self.seasons.

        Centralizes the strptime parsing that was previously repeated verbatim
        in get_season_stats(), plot_seasons_comparison(), and plot_full_timeline().
        """
        return {
            name: (datetime.strptime(start, "%Y-%m-%d"),
                   datetime.strptime(end, "%Y-%m-%d"))
            for name, (start, end) in self.seasons.items()
        }

    def fetch_data(self, board="tv"):
        """Fetch daily history for *board* from the 4stats.io API into self.data.

        Returns True on success, False on any request failure (already logged).
        """
        print(f"📡 Fetching data from 4stats.io for /{board}/...")
        # Browser-like headers copied from a real session; presumably needed so
        # the API does not reject a generic client — TODO confirm which are required.
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Origin": "https://4stats.io",
            "DNT": "1",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }
        try:
            response = requests.get(f"https://api.4stats.io/history/day/{board}",
                                    headers=headers, timeout=30)
            response.raise_for_status()
            self.data = response.json()
            print(f"✅ Downloaded {len(self.data)} records.")
            return True
        except requests.exceptions.RequestException as e:
            print(f"❌ Failed to fetch data: {e}")
            return False

    def filter_and_normalize(self, start_year=2023):
        """Filter rows from *start_year* onward and detrend posts-per-day.

        Drops rows missing a timestamp or posts-per-day value, fits a linear
        trend over time and subtracts it (re-centered on the overall mean so
        values stay in a familiar posts/day scale), then stores the result as
        self.df with columns Timestamp / Column_2 / Posts_Per_Day / Column_4 / Date.

        Raises:
            ValueError: if no data has been fetched, or fewer than two valid
                rows remain (linregress needs at least two points).
        """
        if not self.data:
            raise ValueError("No data available. Run fetch_data() first.")
        # API timestamps are epoch milliseconds, hence the * 1000.
        cutoff = datetime(start_year, 1, 1).timestamp() * 1000
        filtered_data = [row for row in self.data if row[0] >= cutoff]
        print(f"✂️ Trimmed to {len(filtered_data)} records from {start_year} onward.")
        # Drop rows missing the timestamp (col 0) or posts-per-day (col 2).
        clean_data = [row for row in filtered_data
                      if row[0] is not None and row[2] is not None]
        if len(clean_data) != len(filtered_data):
            print(f"🧹 Removed {len(filtered_data) - len(clean_data)} rows with missing data.")
        if len(clean_data) < 2:
            # A single point (or none) cannot support a trend fit.
            raise ValueError("No valid data remaining after cleaning.")
        timestamps = np.array([row[0] for row in clean_data])
        posts_per_day = np.array([row[2] for row in clean_data], dtype=float)
        # Remove the long-term linear trend so seasons are comparable across years.
        slope, intercept, r_value, p_value, std_err = linregress(timestamps, posts_per_day)
        trendline = slope * timestamps + intercept
        mean_value = posts_per_day.mean()
        adjusted_posts_per_day = posts_per_day - (trendline - mean_value)
        print(f"📉 Detrended data - R²: {r_value**2:.4f}, p-value: {p_value:.4e}")
        # slope is per millisecond; 86,400,000 ms/day converts it to posts/day per day.
        print(f"📈 Trend slope: {slope*86400000:.2f} posts/day per day")
        adjusted_data = [
            [row[0], row[1], float(round(adj, 2)), row[3]]
            for row, adj in zip(clean_data, adjusted_posts_per_day)
        ]
        self.df = pd.DataFrame(adjusted_data,
                               columns=["Timestamp", "Column_2", "Posts_Per_Day", "Column_4"])
        self.df["Date"] = pd.to_datetime(self.df["Timestamp"], unit="ms")
        return self.df

    def get_season_stats(self):
        """Return per-season descriptive stats of the detrended posts-per-day.

        Returns a dict keyed by season name (seasons with no rows are omitted);
        each value holds mean/std/max/min, the day count, and the date bounds.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")
        stats = {}
        for season_name, (start_date, end_date) in self._season_date_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)]
            if len(season_df) > 0:
                ppd = season_df["Posts_Per_Day"]
                stats[season_name] = {
                    "mean": ppd.mean(),
                    "std": ppd.std(),
                    "max": ppd.max(),
                    "min": ppd.min(),
                    "days": len(season_df),
                    "start_date": start_date,
                    "end_date": end_date,
                }
        return stats

    def plot_seasons_comparison(self, max_days=42, figsize=(12, 8)):
        """Plot every season's activity aligned to a common "day of season" axis.

        Args:
            max_days: truncate each season to its first *max_days* days.
            figsize: matplotlib figure size.

        Returns the matplotlib Figure.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")
        print(f"📊 Creating comparison graph (max {max_days} days)...")
        plt.figure(figsize=figsize)
        for season_name, (start_date, end_date) in self._season_date_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)].copy()
            if len(season_df) == 0:
                print(f"⚠️ No data found for {season_name}")
                continue
            # Align seasons on a shared x axis: day 1 = season start.
            season_df["Day_Index"] = (season_df["Date"] - start_date).dt.days + 1
            season_df = season_df[season_df["Day_Index"] <= max_days]
            plt.plot(season_df["Day_Index"], season_df["Posts_Per_Day"],
                     label=f"{season_name} (n={len(season_df)})",
                     color=self.SEASON_COLORS[season_name], linewidth=2, alpha=0.8)
        plt.xlabel("Days from Season Start", fontsize=12)
        plt.ylabel("Normalized Posts Per Day", fontsize=12)
        plt.title("4chan /tv/ Activity Across Seasons\n(Detrended and Normalized)", fontsize=14, pad=20)
        plt.legend(loc='upper right')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        return plt.gcf()

    def plot_full_timeline(self, figsize=(15, 8)):
        """Plot the whole detrended timeline in gray with seasons overplotted in color.

        Returns the matplotlib Figure.
        """
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")
        plt.figure(figsize=figsize)
        # Full timeline as a muted background trace.
        plt.plot(self.df["Date"], self.df["Posts_Per_Day"],
                 color='lightgray', alpha=0.7, linewidth=1)
        # Re-draw each season's span on top in its own color.
        for season_name, (start_date, end_date) in self._season_date_ranges().items():
            season_df = self.df[(self.df["Date"] >= start_date) & (self.df["Date"] <= end_date)]
            if len(season_df) > 0:
                plt.plot(season_df["Date"], season_df["Posts_Per_Day"],
                         color=self.SEASON_COLORS[season_name], linewidth=3, alpha=0.8,
                         label=season_name)
        plt.xlabel("Date", fontsize=12)
        plt.ylabel("Normalized Posts Per Day", fontsize=12)
        plt.title("4chan /tv/ Activity Timeline with Season Highlights", fontsize=14, pad=20)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        return plt.gcf()

    def print_summary(self):
        """Print a formatted per-season statistics report to stdout."""
        stats = self.get_season_stats()
        print("\n📊 SEASON SUMMARY STATISTICS")
        print("=" * 50)
        for season_name, season_stats in stats.items():
            print(f"\n{season_name}:")
            print(f" Duration: {season_stats['days']} days")
            print(f" Mean posts/day: {season_stats['mean']:.1f}")
            print(f" Std deviation: {season_stats['std']:.1f}")
            print(f" Range: {season_stats['min']:.1f} - {season_stats['max']:.1f}")
            print(f" Dates: {season_stats['start_date'].strftime('%Y-%m-%d')} to {season_stats['end_date'].strftime('%Y-%m-%d')}")
def main():
    """Fetch, process, summarize, and plot 4chan /tv/ activity end to end."""
    analyzer = FourChanAnalyzer()
    # Bail out early if the network fetch failed (fetch_data already logged why).
    if not analyzer.fetch_data():
        return
    analyzer.filter_and_normalize()
    analyzer.print_summary()
    # Show each figure in turn; plt.show() blocks until the window is closed.
    # (Figure return values were previously bound to unused locals.)
    analyzer.plot_seasons_comparison()
    plt.show()
    analyzer.plot_full_timeline()
    plt.show()


if __name__ == "__main__":
    main()