#!/bin/env python3

import json
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from scipy.stats import linregress


class FourChanAnalyzer:
    """Download, detrend and visualise daily post counts for a 4chan board.

    Intended call order: ``fetch_data()`` -> ``filter_and_normalize()`` ->
    any of ``get_season_stats()``, ``print_summary()`` or the ``plot_*``
    methods.

    Raw records are indexed positionally: item 0 is used as an epoch
    timestamp in milliseconds and item 2 as the posts-per-day figure. Items
    1 and 3 are carried through untouched; this class does not interpret
    them.
    """

    # Line colour for each known season. A season missing from this table
    # falls back to Matplotlib's default colour cycle instead of raising.
    SEASON_COLORS = {
        "Season 1": "blue",
        "Season 2": "orange",
        "Season 3": "green",
        "Season 4": "red",
        "Bloodgames-Bitchtank": "purple",
    }

    # Timestamps are in milliseconds, so a per-ms slope is scaled by this to
    # report a per-day slope.
    _MS_PER_DAY = 86_400_000

    def __init__(self):
        # Parsed JSON records from the API, or None before fetch_data().
        self.data = None
        # Processed DataFrame, or None before filter_and_normalize().
        self.df = None
        # Board the current data belongs to; used in plot titles.
        self.board = "tv"
        # Season name -> (first day, last day), both inclusive, "YYYY-MM-DD".
        self.seasons = {
            "Season 1": ("2023-04-18", "2023-05-31"),
            "Season 2": ("2023-12-18", "2024-01-28"),
            "Season 3": ("2024-10-27", "2024-12-08"),
            "Season 4": ("2025-06-13", "2025-06-28"),
            "Bloodgames-Bitchtank": ("2024-06-24", "2024-08-02"),
        }

    def fetch_data(self, board="tv"):
        """Fetch the daily history of ``board`` from the 4stats.io API.

        Args:
            board: Board short name, interpolated into the request URL.

        Returns:
            True when the records were downloaded and stored in ``self.data``;
            False when the request failed or the body was not valid JSON (the
            reason is printed and ``self.data`` is left untouched).
        """
        print(f"📡 Fetching data from 4stats.io for /{board}/...")

        # Accept-Encoding is deliberately not set here: Requests then
        # advertises only the encodings it can actually decode. Claiming
        # "br"/"zstd" without the optional decoders installed would hand an
        # undecoded body to the JSON parser.
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0",
            "Accept": "application/json, text/plain, */*",
            "Origin": "https://4stats.io",
            "DNT": "1",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }

        try:
            response = requests.get(
                f"https://api.4stats.io/history/day/{board}",
                headers=headers,
                timeout=30,
            )
            response.raise_for_status()
            payload = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers Requests versions whose JSON decode error is
            # not a RequestException subclass.
            print(f"❌ Failed to fetch data: {e}")
            return False

        self.data = payload
        self.board = board
        print(f"✅ Downloaded {len(self.data)} records.")
        return True

    def filter_and_normalize(self, start_year=2023):
        """Keep records from ``start_year`` onward and remove the linear trend.

        The detrended series keeps the mean of the raw series, so values stay
        on the original posts-per-day scale.

        Args:
            start_year: First calendar year (UTC) to keep.

        Returns:
            The processed DataFrame (also stored in ``self.df``) with columns
            ``Timestamp``, ``Column_2``, ``Posts_Per_Day``, ``Column_4`` and
            ``Date``.

        Raises:
            ValueError: If no data was fetched, or nothing usable remains
                after filtering and cleaning.
        """
        if not self.data:
            raise ValueError("No data available. Run fetch_data() first.")

        # The Date column below is derived as UTC, so the cutoff must be UTC
        # too; a naive datetime would shift it by the machine's local offset.
        cutoff = datetime(start_year, 1, 1, tzinfo=timezone.utc).timestamp() * 1000

        # Rows without a timestamp cannot be compared against the cutoff, so
        # they are set aside before the year filter rather than after it.
        dated = [row for row in self.data if row[0] is not None]
        filtered_data = [row for row in dated if row[0] >= cutoff]
        print(f"✂️ Trimmed to {len(filtered_data)} records from {start_year} onward.")

        clean_data = [row for row in filtered_data if row[2] is not None]
        removed = (len(self.data) - len(dated)) + (len(filtered_data) - len(clean_data))
        if removed:
            print(f"🧹 Removed {removed} rows with missing data.")

        if not clean_data:
            raise ValueError("No valid data remaining after cleaning.")

        timestamps = np.array([row[0] for row in clean_data], dtype=float)
        posts_per_day = np.array([row[2] for row in clean_data], dtype=float)
        adjusted_posts_per_day = self._detrend(timestamps, posts_per_day)

        adjusted_data = [
            [row[0], row[1], float(round(adj, 2)), row[3]]
            for row, adj in zip(clean_data, adjusted_posts_per_day)
        ]

        self.df = pd.DataFrame(
            adjusted_data,
            columns=["Timestamp", "Column_2", "Posts_Per_Day", "Column_4"],
        )
        self.df["Date"] = pd.to_datetime(self.df["Timestamp"], unit="ms")

        return self.df

    @classmethod
    def _detrend(cls, timestamps, posts_per_day):
        """Subtract the fitted linear trend while preserving the series mean."""
        # A slope cannot be estimated from a single point in time; return the
        # values unchanged instead of producing NaNs or a SciPy error.
        if np.unique(timestamps).size < 2:
            print("⚠️ Fewer than two distinct timestamps; skipping detrending.")
            return posts_per_day.copy()

        fit = linregress(timestamps, posts_per_day)
        trendline = fit.slope * timestamps + fit.intercept

        print(f"📉 Detrended data - R²: {fit.rvalue**2:.4f}, p-value: {fit.pvalue:.4e}")
        print(f"📈 Trend slope: {fit.slope * cls._MS_PER_DAY:.2f} posts/day per day")

        return posts_per_day - (trendline - posts_per_day.mean())

    def _require_processed(self):
        """Raise ValueError unless filter_and_normalize() has been run."""
        if self.df is None:
            raise ValueError("No processed data available. Run filter_and_normalize() first.")

    def _season_ranges(self):
        """Return ``self.seasons`` with both dates parsed to ``datetime``."""
        return {
            name: (datetime.strptime(start, "%Y-%m-%d"), datetime.strptime(end, "%Y-%m-%d"))
            for name, (start, end) in self.seasons.items()
        }

    def _season_slice(self, start_date, end_date):
        """Return the rows of ``self.df`` dated within [start_date, end_date]."""
        # Compare on calendar days so a reading taken later than midnight on
        # the closing day still counts as part of the season.
        day = self.df["Date"].dt.normalize()
        return self.df[(day >= start_date) & (day <= end_date)]

    def get_season_stats(self):
        """Compute summary statistics of ``Posts_Per_Day`` for each season.

        Returns:
            Dict mapping season name to a dict with keys ``mean``, ``std``,
            ``max``, ``min``, ``days``, ``start_date`` and ``end_date``.
            Seasons with no rows in the data are omitted.

        Raises:
            ValueError: If filter_and_normalize() has not been run.
        """
        self._require_processed()

        stats = {}
        for season_name, (start_date, end_date) in self._season_ranges().items():
            posts = self._season_slice(start_date, end_date)["Posts_Per_Day"]
            if posts.empty:
                continue

            stats[season_name] = {
                "mean": posts.mean(),
                "std": posts.std(),
                "max": posts.max(),
                "min": posts.min(),
                "days": len(posts),
                "start_date": start_date,
                "end_date": end_date,
            }

        return stats

    def plot_seasons_comparison(self, max_days=42, figsize=(12, 8)):
        """Overlay all seasons on a shared "day of season" axis.

        Args:
            max_days: Highest day index (1-based) to draw for each season.
            figsize: Matplotlib figure size in inches.

        Returns:
            The Matplotlib figure that was drawn.

        Raises:
            ValueError: If filter_and_normalize() has not been run.
        """
        self._require_processed()

        print(f"📊 Creating comparison graph (max {max_days} days)...")

        fig = plt.figure(figsize=figsize)

        for season_name, (start_date, end_date) in self._season_ranges().items():
            season_df = self._season_slice(start_date, end_date).copy()

            if season_df.empty:
                print(f"⚠️ No data found for {season_name}")
                continue

            # Day 1 is the season's first calendar day.
            season_df["Day_Index"] = (season_df["Date"] - start_date).dt.days + 1
            season_df = season_df[season_df["Day_Index"] <= max_days]

            plt.plot(
                season_df["Day_Index"],
                season_df["Posts_Per_Day"],
                label=f"{season_name} (n={len(season_df)})",
                color=self.SEASON_COLORS.get(season_name),
                linewidth=2,
                alpha=0.8,
            )

        plt.xlabel("Days from Season Start", fontsize=12)
        plt.ylabel("Normalized Posts Per Day", fontsize=12)
        plt.title(
            f"4chan /{self.board}/ Activity Across Seasons\n(Detrended and Normalized)",
            fontsize=14,
            pad=20,
        )
        # Only draw a legend when at least one season was plotted.
        if plt.gca().get_legend_handles_labels()[0]:
            plt.legend(loc='upper right')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        return fig

    def plot_full_timeline(self, figsize=(15, 8)):
        """Plot the whole processed series with each season highlighted.

        Args:
            figsize: Matplotlib figure size in inches.

        Returns:
            The Matplotlib figure that was drawn.

        Raises:
            ValueError: If filter_and_normalize() has not been run.
        """
        self._require_processed()

        fig = plt.figure(figsize=figsize)

        # Faint background line for the full period.
        plt.plot(
            self.df["Date"],
            self.df["Posts_Per_Day"],
            color='lightgray',
            alpha=0.7,
            linewidth=1,
        )

        for season_name, (start_date, end_date) in self._season_ranges().items():
            season_df = self._season_slice(start_date, end_date)
            if season_df.empty:
                continue

            plt.plot(
                season_df["Date"],
                season_df["Posts_Per_Day"],
                color=self.SEASON_COLORS.get(season_name),
                linewidth=3,
                alpha=0.8,
                label=season_name,
            )

        plt.xlabel("Date", fontsize=12)
        plt.ylabel("Normalized Posts Per Day", fontsize=12)
        plt.title(
            f"4chan /{self.board}/ Activity Timeline with Season Highlights",
            fontsize=14,
            pad=20,
        )
        # The background line is unlabeled, so skip the legend when no season
        # had data.
        if plt.gca().get_legend_handles_labels()[0]:
            plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()

        return fig

    def print_summary(self):
        """Print the per-season statistics from get_season_stats().

        Raises:
            ValueError: If filter_and_normalize() has not been run.
        """
        stats = self.get_season_stats()

        print("\n📊 SEASON SUMMARY STATISTICS")
        print("=" * 50)

        for season_name, season_stats in stats.items():
            print(f"\n{season_name}:")
            print(f" Duration: {season_stats['days']} days")
            print(f" Mean posts/day: {season_stats['mean']:.1f}")
            print(f" Std deviation: {season_stats['std']:.1f}")
            print(f" Range: {season_stats['min']:.1f} - {season_stats['max']:.1f}")
            print(f" Dates: {season_stats['start_date'].strftime('%Y-%m-%d')} to {season_stats['end_date'].strftime('%Y-%m-%d')}")


def main():
    """Run the full pipeline: fetch, detrend, summarise, then show both plots."""
    analyzer = FourChanAnalyzer()

    # Nothing to analyse if the download failed; fetch_data() has already
    # printed the reason.
    if not analyzer.fetch_data():
        return

    analyzer.filter_and_normalize()
    analyzer.print_summary()

    # Each plot method creates its own figure; show them one after the other.
    analyzer.plot_seasons_comparison()
    plt.show()

    analyzer.plot_full_timeline()
    plt.show()


if __name__ == "__main__":
    main()