import hashlib
import json
import pickle
from datetime import datetime
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from datasets import load_dataset
from tqdm import tqdm
# Cache configuration
CACHE_DIR = Path("./pwc_cache")
CACHE_DIR.mkdir(exist_ok=True)
# Directory structure for disk-based storage
TASKS_INDEX_FILE = CACHE_DIR / "tasks_index.json" # Small JSON file with task list
TASK_DATA_DIR = CACHE_DIR / "task_data" # Directory for individual task files
DATASET_DATA_DIR = CACHE_DIR / "dataset_data" # Directory for individual dataset files
METRICS_INDEX_FILE = CACHE_DIR / "metrics_index.json" # Metrics metadata
# Create directories
TASK_DATA_DIR.mkdir(exist_ok=True)
DATASET_DATA_DIR.mkdir(exist_ok=True)
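# Expected on-disk layout once the cache has been built (the task/dataset names
# shown are illustrative, not taken from the actual corpus):
#
#   pwc_cache/
#     tasks_index.json                                        # list of task names
#     metrics_index.json                                      # per-metric count + direction
#     task_data/task_Image_Classification.pkl                 # one file per task
#     dataset_data/data_Image_Classification_ImageNet.pkl     # one file per (task, dataset)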
def sanitize_filename(name):
"""Convert a string to a safe filename."""
# Replace problematic characters with underscores
safe_name = name.replace('/', '_').replace('\\', '_').replace(':', '_')
safe_name = safe_name.replace('*', '_').replace('?', '_').replace('"', '_')
safe_name = safe_name.replace('<', '_').replace('>', '_').replace('|', '_')
safe_name = safe_name.replace(' ', '_').replace('.', '_')
# Remove multiple underscores and trim
safe_name = '_'.join(filter(None, safe_name.split('_')))
# Limit length to avoid filesystem issues
if len(safe_name) > 200:
# If too long, use first 150 chars + hash of full name
safe_name = safe_name[:150] + '_' + hashlib.md5(name.encode()).hexdigest()[:8]
return safe_name
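# Illustrative behaviour of sanitize_filename (the input below is hypothetical):
#   sanitize_filename("Semantic Segmentation/ADE20K v1.1")
#   -> "Semantic_Segmentation_ADE20K_v1_1"
# Names longer than 200 characters are truncated to 150 chars plus an 8-char MD5 suffix.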
def get_task_filename(task):
"""Generate a safe filename for a task."""
safe_name = sanitize_filename(task)
return TASK_DATA_DIR / f"task_{safe_name}.pkl"
def get_dataset_filename(task, dataset_name):
"""Generate a safe filename for a dataset."""
safe_task = sanitize_filename(task)
safe_dataset = sanitize_filename(dataset_name)
# Include both task and dataset in filename for clarity
filename = f"data_{safe_task}_{safe_dataset}.pkl"
# If combined name is too long, shorten it
if len(filename) > 255:
# Use shorter version with hash
filename = f"data_{safe_task[:50]}_{safe_dataset[:50]}_{hashlib.md5(f'{task}||{dataset_name}'.encode()).hexdigest()[:8]}.pkl"
return DATASET_DATA_DIR / filename
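# Example of the resulting path (task and dataset names are hypothetical):
#   get_dataset_filename("Object Detection", "COCO test-dev")
#   -> pwc_cache/dataset_data/data_Object_Detection_COCO_test-dev.pkl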
def cache_exists():
"""Check if cache structure exists."""
return TASKS_INDEX_FILE.exists() and METRICS_INDEX_FILE.exists()
def build_disk_based_cache():
    """Build the cache with minimal memory usage by writing each task and dataset to disk as it is processed."""
    print("=" * 60)
print("Building disk-based cache (one-time operation)...")
print("=" * 60)
# Initialize tracking structures (kept small)
tasks_set = set()
metrics_index = {}
    print("\n[1/4] Processing dataset and building cache...")
    # Load the full dataset (non-streaming) so len() and the tqdm progress total are available
    ds = load_dataset("pwc-archive/evaluation-tables", split="train", streaming=False)
total_items = len(ds)
processed_count = 0
dataset_count = 0
    for item in tqdm(ds, total=total_items):
        task = item['task']
if not task:
continue
tasks_set.add(task)
# Load existing task data from disk or create new
task_file = get_task_filename(task)
if task_file.exists():
with open(task_file, 'rb') as f:
task_data = pickle.load(f)
else:
task_data = {
'categories': set(),
'datasets': set(),
'date_range': {'min': None, 'max': None}
}
# Update task data
if item['categories']:
task_data['categories'].update(item['categories'])
# Process datasets
if item['datasets']:
for dataset in item['datasets']:
if not isinstance(dataset, dict) or 'dataset' not in dataset:
continue
dataset_name = dataset['dataset']
dataset_file = get_dataset_filename(task, dataset_name)
# Skip if already processed
if dataset_file.exists():
task_data['datasets'].add(dataset_name)
continue
task_data['datasets'].add(dataset_name)
# Process SOTA data
if 'sota' not in dataset or 'rows' not in dataset['sota']:
continue
models_data = []
for row in dataset['sota']['rows']:
if not isinstance(row, dict):
continue
model_name = row.get('model_name', 'Unknown Model')
# Extract metrics
metrics = {}
if 'metrics' in row and isinstance(row['metrics'], dict):
for metric_name, metric_value in row['metrics'].items():
if metric_value is not None:
metrics[metric_name] = metric_value
# Track metric metadata
if metric_name not in metrics_index:
metrics_index[metric_name] = {
'count': 0,
'is_lower_better': any(kw in metric_name.lower()
for kw in ['error', 'loss', 'time', 'cost'])
}
metrics_index[metric_name]['count'] += 1
# Parse date
paper_date = row.get('paper_date')
try:
if paper_date and isinstance(paper_date, str):
release_date = pd.to_datetime(paper_date)
else:
release_date = pd.to_datetime('2020-01-01')
                    except Exception:
release_date = pd.to_datetime('2020-01-01')
# Update date range
if task_data['date_range']['min'] is None or release_date < task_data['date_range']['min']:
task_data['date_range']['min'] = release_date
if task_data['date_range']['max'] is None or release_date > task_data['date_range']['max']:
task_data['date_range']['max'] = release_date
# Build model entry
model_entry = {
'model_name': model_name,
'release_date': release_date,
'paper_date': row.get('paper_date', ''), # Store raw paper_date for dynamic parsing
'paper_url': row.get('paper_url', ''),
'paper_title': row.get('paper_title', ''),
'code_url': row.get('code_links', [''])[0] if row.get('code_links') else '',
**metrics
}
models_data.append(model_entry)
if models_data:
df = pd.DataFrame(models_data)
df = df.sort_values('release_date')
# Save dataset to its own file
with open(dataset_file, 'wb') as f:
pickle.dump(df, f, protocol=pickle.HIGHEST_PROTOCOL)
dataset_count += 1
# Clear DataFrame from memory
del df
del models_data
# Save updated task data back to disk
with open(task_file, 'wb') as f:
            # Convert sets to sorted lists so the saved structure has a stable order
task_data_to_save = {
'categories': sorted(list(task_data['categories'])),
'datasets': sorted(list(task_data['datasets'])),
'date_range': task_data['date_range']
}
pickle.dump(task_data_to_save, f, protocol=pickle.HIGHEST_PROTOCOL)
# Clear task data from memory
del task_data
processed_count += 1
print(f"\n✓ Processed {len(tasks_set)} tasks and {dataset_count} datasets")
print("\n[2/4] Saving index files...")
# Save tasks index (small file)
tasks_list = sorted(list(tasks_set))
with open(TASKS_INDEX_FILE, 'w') as f:
json.dump(tasks_list, f)
print(f" ✓ Saved tasks index ({len(tasks_list)} tasks)")
# Save metrics index
with open(METRICS_INDEX_FILE, 'w') as f:
json.dump(metrics_index, f, indent=2)
print(f" ✓ Saved metrics index ({len(metrics_index)} metrics)")
print("\n[3/4] Calculating cache statistics...")
# Calculate total cache size
total_size = 0
for file in TASK_DATA_DIR.glob("*.pkl"):
total_size += file.stat().st_size
for file in DATASET_DATA_DIR.glob("*.pkl"):
total_size += file.stat().st_size
print(f" ✓ Total cache size: {total_size / 1024 / 1024:.1f} MB")
print(f" ✓ Task files: {len(list(TASK_DATA_DIR.glob('*.pkl')))}")
print(f" ✓ Dataset files: {len(list(DATASET_DATA_DIR.glob('*.pkl')))}")
print("\n[4/4] Cache building complete!")
print("=" * 60)
return tasks_list
def load_tasks_index():
"""Load just the task list from disk."""
with open(TASKS_INDEX_FILE, 'r') as f:
return json.load(f)
def load_task_data(task):
"""Load data for a specific task from disk."""
task_file = get_task_filename(task)
if task_file.exists():
with open(task_file, 'rb') as f:
return pickle.load(f)
return None
def load_dataset_data(task, dataset_name):
"""Load a specific dataset from disk."""
dataset_file = get_dataset_filename(task, dataset_name)
if dataset_file.exists():
with open(dataset_file, 'rb') as f:
return pickle.load(f)
return pd.DataFrame()
def load_metrics_index():
"""Load metrics index from disk."""
if METRICS_INDEX_FILE.exists():
with open(METRICS_INDEX_FILE, 'r') as f:
return json.load(f)
return {}
# Initialize - build the cache if it doesn't exist
if cache_exists():
print("Loading task index from disk...")
TASKS = load_tasks_index()
print(f"✓ Loaded {len(TASKS)} tasks")
else:
TASKS = build_disk_based_cache()
# Load metrics index once (it's small)
METRICS_INDEX = load_metrics_index()
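# METRICS_INDEX maps each metric name to the metadata collected while building
# the cache, e.g. (metric name illustrative):
#   METRICS_INDEX["Top 1 Accuracy"] -> {"count": <rows seen>, "is_lower_better": False}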
# Memory-efficient accessor functions
def get_tasks():
"""Get all tasks from index."""
return TASKS
def get_task_data(task):
"""Load task data from disk on-demand."""
return load_task_data(task)
def get_categories(task):
"""Get categories for a task (loads from disk)."""
task_data = get_task_data(task)
return task_data['categories'] if task_data else []
def get_datasets_for_task(task):
"""Get datasets for a task (loads from disk)."""
task_data = get_task_data(task)
return task_data['datasets'] if task_data else []
def get_cached_model_data(task, dataset_name):
"""Load dataset from disk on-demand."""
return load_dataset_data(task, dataset_name)
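# Illustrative end-to-end use of the accessors, assuming the cache contains a
# task called "Image Classification" with an "ImageNet" leaderboard:
#   datasets = get_datasets_for_task("Image Classification")
#   df = get_cached_model_data("Image Classification", "ImageNet")
#   fig = create_sota_plot(df, "Top 1 Accuracy")   # defined below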
def parse_paper_date(paper_date, paper_title="", paper_url=""):
"""Parse paper date with improved fallback strategies."""
import re
# Try to parse the raw paper_date if available
if paper_date and isinstance(paper_date, str) and paper_date.strip():
try:
# Try common date formats
date_formats = [
'%Y-%m-%d',
'%Y/%m/%d',
'%d-%m-%Y',
'%d/%m/%Y',
'%Y-%m',
'%Y/%m',
'%Y'
]
for fmt in date_formats:
try:
return pd.to_datetime(paper_date.strip(), format=fmt)
                except Exception:
continue
# Try pandas automatic parsing
return pd.to_datetime(paper_date.strip())
        except Exception:
pass
# Fallback: try to extract year from paper title or URL
year_pattern = r'\b(19[5-9]\d|20[0-9]\d)\b' # Match 1950-2099
# Look for year in paper title
if paper_title:
years = re.findall(year_pattern, str(paper_title))
if years:
try:
year = max(years) # Use the latest year found
return pd.to_datetime(f'{year}-01-01')
            except Exception:
pass
# Look for year in paper URL
if paper_url:
years = re.findall(year_pattern, str(paper_url))
if years:
try:
year = max(years) # Use the latest year found
return pd.to_datetime(f'{year}-01-01')
            except Exception:
pass
# Final fallback: return None instead of a default year
return None
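# Illustrative behaviour (the inputs below are made up):
#   parse_paper_date("2021-06")                      -> Timestamp("2021-06-01")
#   parse_paper_date("", "Awesome Paper (2019)")     -> Timestamp("2019-01-01")  # year pulled from title
#   parse_paper_date(None, "", "")                   -> None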
def get_task_statistics(task):
    """Get statistics about a task (placeholder; currently returns an empty dict)."""
    return {}
def create_sota_plot(df, metric):
"""Create a plot showing model performance evolution over time.
Args:
df: DataFrame with model data
metric: Metric name to plot on y-axis
"""
if df.empty or metric not in df.columns:
fig = go.Figure()
fig.add_annotation(
text="No data available for this metric",
xref="paper",
yref="paper",
x=0.5,
y=0.5,
showarrow=False,
font=dict(size=20)
)
fig.update_layout(
title="No Data Available",
height=600,
plot_bgcolor='white',
paper_bgcolor='white'
)
return fig
# Remove rows where the metric is NaN
df_clean = df.dropna(subset=[metric]).copy()
if df_clean.empty:
fig = go.Figure()
fig.add_annotation(
text="No valid data points for this metric",
xref="paper",
yref="paper",
x=0.5,
y=0.5,
showarrow=False,
font=dict(size=20)
)
fig.update_layout(
title="No Data Available",
height=600,
plot_bgcolor='white',
paper_bgcolor='white'
)
return fig
# Convert metric column to numeric, handling any string values
try:
df_clean[metric] = pd.to_numeric(
df_clean[metric].apply(lambda x: x.strip()[:-1] if isinstance(x, str) and x.strip().endswith("%") else x),
errors='coerce')
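        # e.g. a string score of "76.3%" becomes the float 76.3; values that still
        # cannot be parsed as numbers are coerced to NaN and dropped just below.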
# Remove any rows that couldn't be converted to numeric
df_clean = df_clean.dropna(subset=[metric])
if df_clean.empty:
fig = go.Figure()
fig.add_annotation(
text=f"No numeric data available for metric: {metric}",
xref="paper",
yref="paper",
x=0.5,
y=0.5,
showarrow=False,
font=dict(size=20)
)
fig.update_layout(
title="No Numeric Data Available",
height=600,
plot_bgcolor='white',
paper_bgcolor='white'
)
return fig
except Exception as e:
fig = go.Figure()
fig.add_annotation(
text=f"Error processing metric data: {str(e)}",
xref="paper",
yref="paper",
x=0.5,
y=0.5,
showarrow=False,
font=dict(size=16)
)
fig.update_layout(
title="Data Processing Error",
height=600,
plot_bgcolor='white',
paper_bgcolor='white'
)
return fig
# Recalculate release dates dynamically from raw paper_date if available
df_processed = df_clean.copy()
if 'paper_date' in df_processed.columns:
# Parse dates dynamically using improved logic
df_processed['dynamic_release_date'] = df_processed.apply(
lambda row: parse_paper_date(
row.get('paper_date', ''),
row.get('paper_title', ''),
row.get('paper_url', '')
), axis=1
)
# Use dynamic dates if available, otherwise fallback to original release_date
df_processed['final_release_date'] = df_processed['dynamic_release_date'].fillna(df_processed['release_date'])
else:
# If no paper_date column, use existing release_date
df_processed['final_release_date'] = df_processed['release_date']
# Filter out rows with no valid date
df_with_dates = df_processed[df_processed['final_release_date'].notna()].copy()
if df_with_dates.empty:
# If no valid dates, return empty plot
fig = go.Figure()
fig.add_annotation(
text="No valid dates available for this dataset",
xref="paper",
yref="paper",
x=0.5,
y=0.5,
showarrow=False,
font=dict(size=20)
)
fig.update_layout(
title="No Date Data Available",
height=600,
plot_bgcolor='white',
paper_bgcolor='white'
)
return fig
# Sort by final release date
df_sorted = df_with_dates.sort_values('final_release_date').copy()
# Check if metric is lower-better
is_lower_better = False
if metric in METRICS_INDEX:
is_lower_better = METRICS_INDEX[metric].get('is_lower_better', False)
else:
is_lower_better = any(keyword in metric.lower() for keyword in ['error', 'loss', 'time', 'cost'])
if is_lower_better:
df_sorted['cumulative_best'] = df_sorted[metric].cummin()
df_sorted['is_sota'] = df_sorted[metric] == df_sorted['cumulative_best']
else:
df_sorted['cumulative_best'] = df_sorted[metric].cummax()
df_sorted['is_sota'] = df_sorted[metric] == df_sorted['cumulative_best']
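    # Illustration (higher-is-better metric): values [70, 68, 75, 74] give a
    # cumulative_best of [70, 70, 75, 75], so only the 70 and 75 entries are
    # flagged as SOTA points.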
# Get SOTA models
sota_df = df_sorted[df_sorted['is_sota']].copy()
# Use the dynamically calculated dates for x-axis
x_values = df_sorted['final_release_date']
x_axis_title = 'Release Date'
# Create the plot
fig = go.Figure()
# Add all models as scatter points
fig.add_trace(go.Scatter(
x=x_values,
y=df_sorted[metric],
mode='markers',
name='All models',
marker=dict(
color=['#00CED1' if is_sota else 'lightgray'
for is_sota in df_sorted['is_sota']],
size=8,
opacity=0.7
),
text=df_sorted['model_name'],
customdata=df_sorted[['paper_title', 'paper_url', 'code_url']],
        hovertemplate='%{text}<br>' +
                      f'{metric}: %{{y:.4f}}<br>' +
                      'Date: %{x}<br>' +
                      'Paper: %{customdata[0]}<br>' +
                      '{x_axis_title}: %{{x}}