import gradio as gr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import json
import os
import re
from typing import Dict, List, Optional, Tuple
# Import data loader
from data_loader import data_loader, get_napolab_datasets, get_sample_benchmark_results, get_model_metadata
# Load data from YAML file
NAPOLAB_DATASETS = get_napolab_datasets()
SAMPLE_BENCHMARK_RESULTS = get_sample_benchmark_results()
MODEL_METADATA = get_model_metadata()
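# Shapes assumed by the code below (illustrative values only; the real data
# comes from the YAML file loaded by data_loader):
#   NAPOLAB_DATASETS = {"assin2_rte": {"name": "ASSIN 2 RTE", ...}, ...}
#   SAMPLE_BENCHMARK_RESULTS = {"assin2_rte": {"model-a": {"accuracy": 0.91, ...}, ...}, ...}
#   MODEL_METADATA = {"model-a": {"architecture": "BERT", "source": "napolab_thesis"}, ...}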
def load_portuguese_leaderboard_data() -> pd.DataFrame:
"""Load data from the Portuguese leaderboard CSV file."""
try:
csv_path = "portuguese_leaderboard.csv"
if os.path.exists(csv_path):
df = pd.read_csv(csv_path)
# Select only the relevant columns
relevant_columns = ['model_name', 'model_num_parameters', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
df = df[relevant_columns].copy()
# Rename columns to match the existing format
df = df.rename(columns={
'assin2_rte': 'ASSIN2 RTE',
'assin2_sts': 'ASSIN2 STS',
'faquad_nli': 'FaQUaD-NLI',
'hatebr_offensive': 'HateBR'
})
# Add source information
df['source'] = 'portuguese_leaderboard'
print(f"Loaded {len(df)} models from Portuguese leaderboard")
return df
else:
print(f"Portuguese leaderboard CSV not found: {csv_path}")
return pd.DataFrame()
except Exception as e:
print(f"Error loading Portuguese leaderboard data: {e}")
return pd.DataFrame()
def load_external_models_data() -> pd.DataFrame:
"""Load data from the external models CSV file."""
try:
csv_path = "external_models.csv"
if os.path.exists(csv_path):
df = pd.read_csv(csv_path)
# Select only the relevant columns
relevant_columns = ['model', 'link', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
df = df[relevant_columns].copy()
# Rename columns to match the existing format
df = df.rename(columns={
'model': 'model_name',
'assin2_rte': 'ASSIN2 RTE',
'assin2_sts': 'ASSIN2 STS',
'faquad_nli': 'FaQUaD-NLI',
'hatebr_offensive': 'HateBR'
})
# Add source information
df['source'] = 'external_models'
# Add model_num_parameters column with 0 for external models
df['model_num_parameters'] = 0
print(f"Loaded {len(df)} external models")
return df
else:
print(f"External models CSV not found: {csv_path}")
return pd.DataFrame()
except Exception as e:
print(f"Error loading external models data: {e}")
return pd.DataFrame()
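# A minimal sketch of a schema check for the two CSV loaders above. The column
# lists and paths mirror the `relevant_columns` and defaults used there; the
# helper itself is an illustrative addition and is never called at import time.
def _check_csv_schema(csv_path: str, required_columns: List[str]) -> bool:
    """Return True if csv_path exists and contains every required column."""
    if not os.path.exists(csv_path):
        print(f"Missing file: {csv_path}")
        return False
    # Read only the header row to avoid loading the full file
    header = pd.read_csv(csv_path, nrows=0).columns
    missing = [col for col in required_columns if col not in header]
    if missing:
        print(f"{csv_path} is missing columns: {missing}")
    return not missing
# Example (manual use):
#   _check_csv_schema("portuguese_leaderboard.csv",
#                     ['model_name', 'model_num_parameters', 'assin2_rte',
#                      'assin2_sts', 'faquad_nli', 'hatebr_offensive'])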
# Load Portuguese leaderboard data
PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()
# Load external models data
EXTERNAL_MODELS_DATA = load_external_models_data()
def create_simplified_benchmark_table(
    selected_datasets: Optional[List[str]] = None,
    show_napolab_thesis: bool = True,
    show_teenytinyllama: bool = True,
    show_portuguese_leaderboard: bool = True,
    show_external_models: bool = True,
    hide_incomplete_models: bool = False,
    min_average_performance: float = 0.0,
    search_query: str = "",
    max_num_parameters: int = 0,
) -> pd.DataFrame:
"""Create a simplified benchmark table with one column per dataset."""
# Get all dataset names
dataset_names = sorted(NAPOLAB_DATASETS.keys())
dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]
# Use selected datasets if provided, otherwise use all datasets
if selected_datasets is None:
selected_datasets = dataset_names
# Collect data for each model
model_data = {}
# Process existing benchmark results
for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
for model_name, metrics in models.items():
if model_name not in model_data:
model_data[model_name] = {
'dataset_scores': {},
'url': None,
'source': 'existing'
}
# Calculate average performance for this dataset
avg_performance = np.mean(list(metrics.values()))
model_data[model_name]['dataset_scores'][dataset_name] = avg_performance
# Process Portuguese leaderboard data
if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
model_name = row['model_name']
if model_name not in model_data:
model_data[model_name] = {
'dataset_scores': {},
'url': None,
'source': 'portuguese_leaderboard',
'num_parameters': row.get('model_num_parameters', 0)
}
# Map Portuguese leaderboard columns to dataset names
column_mapping = {
'ASSIN2 RTE': 'assin2_rte',
'ASSIN2 STS': 'assin2_sts',
'FaQUaD-NLI': 'faquad-nli',
'HateBR': 'hatebr'
}
for display_name, dataset_name in column_mapping.items():
if dataset_name in NAPOLAB_DATASETS:
score = row[display_name]
if pd.notna(score) and score > 0:
model_data[model_name]['dataset_scores'][dataset_name] = score
# Process external models data
if show_external_models and not EXTERNAL_MODELS_DATA.empty:
for _, row in EXTERNAL_MODELS_DATA.iterrows():
model_name = row['model_name']
if model_name not in model_data:
model_data[model_name] = {
'dataset_scores': {},
'url': row.get('link', ''),
'source': 'external_models',
'num_parameters': row.get('model_num_parameters', 0)
}
# Map external models columns to dataset names
column_mapping = {
'ASSIN2 RTE': 'assin2_rte',
'ASSIN2 STS': 'assin2_sts',
'FaQUaD-NLI': 'faquad-nli',
'HateBR': 'hatebr'
}
for display_name, dataset_name in column_mapping.items():
if dataset_name in NAPOLAB_DATASETS:
score = row[display_name]
if pd.notna(score) and score > 0:
model_data[model_name]['dataset_scores'][dataset_name] = score
# Get model URLs and source information for existing models
additional_models = data_loader.get_additional_models()
for model_name in model_data.keys():
if model_data[model_name]['source'] == 'existing':
# Get URL
for arch_models in additional_models.values():
if model_name in arch_models:
model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
break
# Get source information
model_metadata = MODEL_METADATA.get(model_name, {})
source = model_metadata.get('source', 'unknown')
model_data[model_name]['source'] = source
# Add num_parameters for existing models (set to 0 as they don't have this info)
model_data[model_name]['num_parameters'] = 0
# Create table data
table_data = []
for model_name, data in model_data.items():
# Apply source filtering
source = data['source']
# Apply show filters - only show models from sources that are checked
if source == 'napolab_thesis' and not show_napolab_thesis:
continue
if source == 'teenytinyllama_paper' and not show_teenytinyllama:
continue
if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
continue
if source == 'external_models' and not show_external_models:
continue
# Hide models with unknown source (should not happen with proper data)
if source == 'unknown':
continue
# Apply parameter filtering (only for Portuguese leaderboard models)
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
num_parameters = data.get('num_parameters', 0)
if num_parameters > max_num_parameters:
continue
        # Create a clickable link for the model name, guarding against missing (NaN) URLs
        if pd.notna(data['url']) and data['url']:
            model_display = f"[{model_name}]({data['url']})"
elif source == 'portuguese_leaderboard' and '/' in model_name:
# Create Hugging Face link for Portuguese leaderboard models with slashes
huggingface_url = f"https://huggingface.co/{model_name}"
model_display = f"[{model_name}]({huggingface_url})"
else:
model_display = model_name
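        # e.g. a leaderboard entry named "org/model-name" is rendered as the
        # markdown link [org/model-name](https://huggingface.co/org/model-name)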
# Create row with dataset scores
row_data = {'Model': model_display}
# Calculate average only over selected datasets
selected_scores = []
for dataset_name in selected_datasets:
score = data['dataset_scores'].get(dataset_name, 0)
if score > 0: # Only include non-zero scores in average
selected_scores.append(score)
overall_avg = np.mean(selected_scores) if selected_scores else 0
row_data['Average'] = round(overall_avg, 4)
        # Add a column for each selected dataset
        for dataset_name, display_name in zip(dataset_names, dataset_display_names):
            if dataset_name in selected_datasets:
                row_data[display_name] = round(data['dataset_scores'].get(dataset_name, 0), 4)
table_data.append(row_data)
df = pd.DataFrame(table_data)
# Filter to show only models that have scores for at least one selected dataset
if selected_datasets and not df.empty:
# Get display names for selected datasets
selected_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in selected_datasets]
# Filter models based on selection criteria
models_to_keep = []
for _, row in df.iterrows():
has_score = False
has_all_scores = True
# Only check the datasets that are actually selected for display
for dataset_name in selected_datasets:
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
if display_name in df.columns:
score = row[display_name]
if score > 0:
has_score = True
else:
has_all_scores = False
# Keep model if it has at least one score
if has_score:
# If hide_incomplete_models is True, only keep models with all scores in selected datasets
if not hide_incomplete_models or has_all_scores:
models_to_keep.append(row['Model'])
# Filter dataframe to only include selected models
if models_to_keep:
df = df[df['Model'].isin(models_to_keep)]
        else:
            # No models to keep: return an empty DataFrame whose column order
            # matches the populated case (Model, Average, then selected datasets)
            columns = ['Model', 'Average']
            for dataset_name, display_name in zip(dataset_names, dataset_display_names):
                if dataset_name in selected_datasets:
                    columns.append(display_name)
            df = pd.DataFrame(columns=columns)
# Filter by minimum average performance
if min_average_performance > 0 and not df.empty:
df = df[df['Average'] >= min_average_performance]
# Filter by search query
if search_query and not df.empty:
# Extract model names from markdown links for searching
df_filtered = df.copy()
df_filtered['model_name_clean'] = df_filtered['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
try:
# Use regex pattern matching
df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False, regex=True)]
except re.error:
# Fallback to simple string matching if regex is invalid
df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False)]
df = df_filtered.drop('model_name_clean', axis=1)
# Sort by Average (descending)
if not df.empty:
df = df.sort_values('Average', ascending=False)
# Add rank column with medal emojis for top 3 and color-coded emojis for others
if not df.empty:
df = df.reset_index(drop=True)
df.index = df.index + 1 # Start ranking from 1
# Create rank column with medal emojis and color-coded emojis
rank_column = []
total_models = len(df)
for rank in df.index:
if rank == 1:
rank_column.append("🥇 1")
elif rank == 2:
rank_column.append("🥈 2")
elif rank == 3:
rank_column.append("🥉 3")
else:
# Color-code based on position relative to total
position_ratio = rank / total_models
if position_ratio <= 0.33: # Top third
rank_column.append("🟢 " + str(rank))
elif position_ratio <= 0.67: # Middle third
rank_column.append("🟡 " + str(rank))
else: # Bottom third
rank_column.append("🔴 " + str(rank))
df.insert(0, 'Rank', rank_column)
return df
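# Example (hypothetical dataset keys, for illustration only): a table limited
# to two datasets, keeping models that average at least 0.8 and match "bert".
#
#   df = create_simplified_benchmark_table(
#       selected_datasets=["assin2_rte", "hatebr"],
#       min_average_performance=0.8,
#       search_query="bert",
#   )
#   print(df.head())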
# Global variable to track the current CSV file
current_csv_file = None
def export_csv(df: pd.DataFrame):
"""Export the benchmark table to CSV."""
global current_csv_file
print(f"Export function called with dataframe shape: {df.shape}")
if df.empty:
print("Dataframe is empty, returning None")
return None
# Clean up previous file if it exists
if current_csv_file:
try:
if os.path.exists(current_csv_file):
os.remove(current_csv_file)
print(f"Deleted previous CSV file: {current_csv_file}")
except Exception as e:
print(f"Error deleting previous file {current_csv_file}: {e}")
# Clean the dataframe for CSV export
df_clean = df.copy()
# Remove markdown formatting from model names for cleaner CSV
df_clean['Model'] = df_clean['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)
# Create filename with timestamp
    from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"napolab_benchmark_results_{timestamp}.csv"
# Create file in current directory (simpler approach)
file_path = filename
print(f"Creating CSV file at: {file_path}")
# Save to CSV file
df_clean.to_csv(file_path, index=False)
print(f"CSV file created successfully. File exists: {os.path.exists(file_path)}")
# Update current file tracking
current_csv_file = file_path
return file_path
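# Example (manual use): export the default table; the returned path is what
# the gr.File component below receives.
#
#   path = export_csv(create_simplified_benchmark_table())
#   if path:
#       print(f"Wrote {path}")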
def cleanup_current_csv():
"""Clean up the current CSV file after download."""
global current_csv_file
if current_csv_file and os.path.exists(current_csv_file):
try:
os.remove(current_csv_file)
print(f"Deleted CSV file after download: {current_csv_file}")
current_csv_file = None
except Exception as e:
print(f"Error deleting file {current_csv_file}: {e}")
def create_model_performance_radar(
    selected_datasets: Optional[List[str]] = None,
    show_napolab_thesis: bool = True,
    show_teenytinyllama: bool = True,
    show_portuguese_leaderboard: bool = True,
    show_external_models: bool = True,
    hide_incomplete_models: bool = False,
    min_average_performance: float = 0.0,
    search_query: str = "",
    max_num_parameters: int = 0,
) -> go.Figure:
"""Create a radar chart showing model performance across all datasets."""
# Use selected datasets if provided, otherwise use all datasets
if selected_datasets is None:
selected_datasets = list(NAPOLAB_DATASETS.keys())
# Get dataset names for the radar axes (only selected ones)
dataset_names = selected_datasets
dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]
# Collect data for each model
model_data = {}
# Process existing benchmark results
for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
if dataset_name in selected_datasets:
for model_name, metrics in models.items():
if model_name not in model_data:
model_data[model_name] = {
'performances': {},
'architecture': MODEL_METADATA.get(model_name, {}).get('architecture', 'Unknown'),
'source': 'existing'
}
# Calculate average performance for this dataset
avg_performance = np.mean(list(metrics.values()))
model_data[model_name]['performances'][dataset_name] = avg_performance
# Process Portuguese leaderboard data
if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
model_name = row['model_name']
if model_name not in model_data:
model_data[model_name] = {
'performances': {},
'architecture': 'Unknown',
'source': 'portuguese_leaderboard',
'num_parameters': row.get('model_num_parameters', 0)
}
# Map Portuguese leaderboard columns to dataset names
column_mapping = {
'ASSIN2 RTE': 'assin2_rte',
'ASSIN2 STS': 'assin2_sts',
'FaQUaD-NLI': 'faquad-nli',
'HateBR': 'hatebr'
}
for display_name, dataset_name in column_mapping.items():
if dataset_name in selected_datasets:
score = row[display_name]
if pd.notna(score) and score > 0:
model_data[model_name]['performances'][dataset_name] = score
# Process external models data
if show_external_models and not EXTERNAL_MODELS_DATA.empty:
for _, row in EXTERNAL_MODELS_DATA.iterrows():
model_name = row['model_name']
if model_name not in model_data:
model_data[model_name] = {
'performances': {},
'architecture': 'Unknown',
'source': 'external_models',
'num_parameters': row.get('model_num_parameters', 0)
}
# Map external models columns to dataset names
column_mapping = {
'ASSIN2 RTE': 'assin2_rte',
'ASSIN2 STS': 'assin2_sts',
'FaQUaD-NLI': 'faquad-nli',
'HateBR': 'hatebr'
}
for display_name, dataset_name in column_mapping.items():
if dataset_name in selected_datasets:
score = row[display_name]
if pd.notna(score) and score > 0:
model_data[model_name]['performances'][dataset_name] = score
# Get model URLs and source information for existing models
additional_models = data_loader.get_additional_models()
for model_name in model_data.keys():
if model_data[model_name]['source'] == 'existing':
# Get URL
for arch_models in additional_models.values():
if model_name in arch_models:
model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
break
# Get source information
model_metadata = MODEL_METADATA.get(model_name, {})
source = model_metadata.get('source', 'unknown')
model_data[model_name]['source'] = source
# Add num_parameters for existing models (set to 0 as they don't have this info)
model_data[model_name]['num_parameters'] = 0
# Apply source filtering
filtered_model_data = {}
for model_name, data in model_data.items():
source = data.get('source', 'existing')
# Apply show filters - only show models from sources that are checked
if source == 'napolab_thesis' and not show_napolab_thesis:
continue
if source == 'teenytinyllama_paper' and not show_teenytinyllama:
continue
if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
continue
if source == 'external_models' and not show_external_models:
continue
# Hide models with unknown source (should not happen with proper data)
if source == 'unknown':
continue
# Apply parameter filtering (only for Portuguese leaderboard models)
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
num_parameters = data.get('num_parameters', 0)
if num_parameters > max_num_parameters:
continue
filtered_model_data[model_name] = data
# Apply incomplete model filtering
if hide_incomplete_models and selected_datasets:
final_filtered_data = {}
for model_name, data in filtered_model_data.items():
has_all_scores = True
for dataset_name in selected_datasets:
if data['performances'].get(dataset_name, 0) == 0:
has_all_scores = False
break
if has_all_scores:
final_filtered_data[model_name] = data
filtered_model_data = final_filtered_data
# Apply minimum average performance filtering
if min_average_performance > 0 and selected_datasets:
final_filtered_data = {}
for model_name, data in filtered_model_data.items():
# Calculate average performance for selected datasets
scores = []
for dataset_name in selected_datasets:
score = data['performances'].get(dataset_name, 0)
if score > 0: # Only include non-zero scores
scores.append(score)
if scores:
avg_performance = np.mean(scores)
if avg_performance >= min_average_performance:
final_filtered_data[model_name] = data
filtered_model_data = final_filtered_data
# Apply search query filtering
if search_query:
final_filtered_data = {}
try:
            # Regex pattern matching; re is imported at module level
            pattern = re.compile(search_query, re.IGNORECASE)
for model_name, data in filtered_model_data.items():
if pattern.search(model_name):
final_filtered_data[model_name] = data
except re.error:
# Fallback to simple string matching if regex is invalid
for model_name, data in filtered_model_data.items():
if search_query.lower() in model_name.lower():
final_filtered_data[model_name] = data
filtered_model_data = final_filtered_data
# Sort models by average performance (descending)
model_performances = []
for model_name, data in filtered_model_data.items():
# Calculate average performance for selected datasets
scores = []
for dataset_name in selected_datasets:
score = data['performances'].get(dataset_name, 0)
if score > 0: # Only include non-zero scores
scores.append(score)
avg_performance = np.mean(scores) if scores else 0
model_performances.append((model_name, data, avg_performance))
# Sort by average performance (descending)
model_performances.sort(key=lambda x: x[2], reverse=True)
# Calculate dynamic range based on actual data
all_performance_values = []
for model_name, data, avg_performance in model_performances:
for dataset_name in dataset_names:
score = data['performances'].get(dataset_name, 0)
if score > 0: # Only include non-zero scores
all_performance_values.append(score)
# Set dynamic range with some padding
if all_performance_values:
min_score = min(all_performance_values)
max_score = max(all_performance_values)
# Add 5% padding below minimum and ensure minimum is not below 0.5
range_min = max(0.5, min_score - (max_score - min_score) * 0.05)
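        # e.g. scores spanning 0.70-0.95 give range_min = max(0.5, 0.70 - 0.25 * 0.05) = 0.6875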
range_max = 1.0
else:
# Fallback to default range if no data
range_min = 0.6
range_max = 1.0
# Create radar chart
fig = go.Figure()
# Generate a more distinguishable color palette
num_models = len(model_performances)
# Create a list of line styles for better differentiation
line_styles = ['solid', 'dash', 'dot', 'dashdot', 'longdash', 'longdashdot']
# Use highly contrasting colors for better differentiation
base_colors = [
'#1f77b4', # Blue
'#ff7f0e', # Orange
'#2ca02c', # Green
'#d62728', # Red
'#9467bd', # Purple
'#8c564b', # Brown
'#e377c2', # Pink
'#7f7f7f', # Gray
'#bcbd22', # Olive
'#17becf', # Cyan
'#ff9896', # Light Red
'#98df8a', # Light Green
'#ffbb78', # Light Orange
'#aec7e8', # Light Blue
'#c5b0d5', # Light Purple
]
# Ensure we have enough colors
while len(base_colors) < num_models:
base_colors.extend(base_colors)
colors = base_colors[:num_models]
for i, (model_name, data, avg_performance) in enumerate(model_performances):
# Get performance values for all datasets (fill with 0 if missing)
performance_values = []
for dataset_name in dataset_names:
performance_values.append(data['performances'].get(dataset_name, 0))
# Close the polygon by adding the first value at the end
if performance_values:
performance_values.append(performance_values[0])
# Assign color and line style based on model index for better differentiation
color = colors[i % len(colors)]
line_style = line_styles[i % len(line_styles)]
# Show first two models by default, hide the rest
visible = True if i < 2 else 'legendonly'
# Create theta values that close the polygon
theta_values = dataset_display_names + [dataset_display_names[0]] if dataset_display_names else []
fig.add_trace(go.Scatterpolar(
r=performance_values,
theta=theta_values,
fill=None,
name=model_name,
line_color=color,
line_dash=line_style,
line_width=3,
opacity=0.8,
visible=visible,
hovertemplate=(
"<b>%{fullData.name}</b><br>" +
"Dataset: %{theta}<br>" +
"Performance: %{r:.3f}<br>" +
"Architecture: " + data['architecture'] + "<br>" +
"<extra></extra>"
)
))
# Update layout
fig.update_layout(
title="Model Performance Radar Chart",
polar=dict(
radialaxis=dict(
visible=True,
range=[range_min, range_max],
gridcolor='rgba(0, 0, 0, 0.2)',
linecolor='rgba(0, 0, 0, 0.5)',
tickcolor='rgba(0, 0, 0, 0.7)',
tickfont=dict(color='rgba(0, 0, 0, 0.8)')
),
angularaxis=dict(
tickmode='array',
tickvals=list(range(len(dataset_display_names))),
ticktext=dataset_display_names,
gridcolor='rgba(0, 0, 0, 0.2)',
linecolor='rgba(0, 0, 0, 0.5)',
tickcolor='rgba(0, 0, 0, 0.7)',
tickfont=dict(color='rgba(0, 0, 0, 0.8)')
),
bgcolor='rgba(255, 255, 255, 0)'
),
height=700,
showlegend=True,
plot_bgcolor='rgba(255, 255, 255, 0)',
paper_bgcolor='rgba(255, 255, 255, 0)',
legend=dict(
yanchor="top",
y=-0.15,
xanchor="center",
x=0.5,
bgcolor='rgba(255, 255, 255, 0.95)',
bordercolor='rgba(0, 0, 0, 0.2)',
borderwidth=1,
orientation="h",
font=dict(color='rgba(0, 0, 0, 0.8)')
),
margin=dict(l=50, r=50, t=100, b=100),
font=dict(color='rgba(0, 0, 0, 0.8)')
)
return fig
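# Example (hypothetical dataset keys, for illustration only): a radar chart of
# thesis models alone on two tasks.
#
#   fig = create_model_performance_radar(
#       selected_datasets=["assin2_rte", "hatebr"],
#       show_portuguese_leaderboard=False,
#       show_external_models=False,
#   )
#   fig.show()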
# Gradio Interface
with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
gr.Markdown("""
# 🌎 Napolab Leaderboard
    Track the latest Portuguese language models and their performance across carefully curated evaluation tasks.
[⭐ Star us on GitHub](https://github.com/ruanchaves/napolab)
""")
with gr.Tabs():
# Benchmark Results Tab
with gr.Tab("🏆 Benchmark Results"):
gr.Markdown("### Model Performance Benchmarks")
with gr.Accordion("Select Datasets to Include: (Click to expand)", open=False):
with gr.Row():
# Create checkboxes for each dataset
dataset_checkboxes = []
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
# Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
checkbox = gr.Checkbox(
label=display_name,
value=default_value
)
dataset_checkboxes.append((dataset_name, checkbox))
with gr.Accordion("Filter by Score: (Click to expand)", open=False):
with gr.Row():
hide_incomplete_models = gr.Checkbox(
label="Hide models with zero scores in selected datasets",
value=True
)
min_average_performance = gr.Slider(
minimum=0,
maximum=100,
value=80,
step=1,
label="Minimum Average Performance (%)"
)
with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
with gr.Row():
show_napolab_thesis = gr.Checkbox(
label="Napolab Thesis models",
value=True
)
show_teenytinyllama = gr.Checkbox(
label="TeenyTinyLlama models",
value=True
)
show_portuguese_leaderboard = gr.Checkbox(
label="Open Portuguese LLM Leaderboard models (open-source)",
value=True
)
show_external_models = gr.Checkbox(
label="Open Portuguese LLM Leaderboard models (proprietary)",
value=True
)
# Calculate max parameters for slider
max_params = 0
if not PORTUGUESE_LEADERBOARD_DATA.empty:
max_params = int(PORTUGUESE_LEADERBOARD_DATA['model_num_parameters'].max())
with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
with gr.Row():
max_num_parameters = gr.Slider(
minimum=0,
maximum=max_params,
value=0,
step=1,
label="Maximum Number of Parameters",
info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
)
# Search bar for filtering models
search_query = gr.Textbox(
label="Search models by name (supports regex)",
placeholder="Enter model name or regex pattern to filter...",
value="",
info="Supports regular expressions. Examples: 'bert.*large', 'gemini|gpt', 'mdeberta.*', '^bert'"
)
benchmark_table = gr.DataFrame(
label="Model Performance Benchmarks",
wrap=[True, False, False, False, False, False, False, False, False, False],
interactive=False,
datatype=["str", "markdown", "number", "number", "number", "number", "number", "number", "number", "number"],
column_widths=["80px", "200px", "100px", "120px", "120px", "120px", "120px", "120px", "120px", "120px"]
)
gr.Markdown("*🥇🥈🥉 = Top 3 | 🟢 = Top 33% | 🟡 = Middle 33% | 🔴 = Bottom 33%*")
# Export to CSV button and file component
export_button = gr.Button("📥 Export to CSV", variant="secondary")
csv_file = gr.File(label="Download CSV", interactive=False, visible=True)
# Model Analysis Tab
with gr.Tab("📈 Model Analysis"):
gr.Markdown("### Model Performance Radar Chart")
# Dataset Selection Controls
with gr.Accordion("Select Datasets to Display: (Click to expand)", open=False):
with gr.Row():
# Create checkboxes for each dataset
analysis_dataset_checkboxes = []
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
# Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
checkbox = gr.Checkbox(
label=display_name,
value=default_value
)
analysis_dataset_checkboxes.append((dataset_name, checkbox))
# Filter Controls
with gr.Accordion("Filter by Score: (Click to expand)", open=False):
with gr.Row():
hide_incomplete_models_analysis = gr.Checkbox(
label="Hide models with zero scores in selected datasets",
value=True
)
min_average_performance_analysis = gr.Slider(
minimum=0,
maximum=100,
value=80,
step=1,
label="Minimum Average Performance (%)"
)
with gr.Accordion("Filter by Data Source: (Click to expand)", open=False):
with gr.Row():
show_napolab_thesis_analysis = gr.Checkbox(
label="Napolab Thesis models",
value=True
)
show_teenytinyllama_analysis = gr.Checkbox(
label="TeenyTinyLlama models",
value=True
)
show_portuguese_leaderboard_analysis = gr.Checkbox(
label="Open Portuguese LLM Leaderboard models (open-source)",
value=True
)
show_external_models_analysis = gr.Checkbox(
label="Open Portuguese LLM Leaderboard models (proprietary)",
value=True
)
# Parameter slider for Model Analysis tab
with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
with gr.Row():
max_num_parameters_analysis = gr.Slider(
minimum=0,
maximum=max_params,
value=0,
step=1,
label="Maximum Number of Parameters",
info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
)
# Search bar for filtering models in radar chart
search_query_analysis = gr.Textbox(
label="Search models by name (supports regex)",
placeholder="Enter model name or regex pattern to filter...",
value="",
info="Supports regular expressions. Examples: 'bert.*large', 'gemini|gpt', 'mdeberta.*', '^bert'"
)
model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")
# Add scatter plot below radar chart
model_scatter_plot = gr.Plot(label="Model Performance vs Number of Parameters")
gr.Markdown("""
**How to interact with the chart:**
- **Click on legend items** to show/hide specific models.
- **Double-click on a legend item** to isolate that model (hide all others).
- **Double-click again** to show all models.
Models in the legend are sorted in descending order based on their average performance across your chosen datasets.
""")
# About Tab
with gr.Tab("ℹ️ About"):
gr.Markdown("""
## About Napolab
**Natural Portuguese Language Benchmark (Napolab)** is a comprehensive collection of Portuguese datasets designed for evaluating Large Language Models.
- [GitHub repository](https://github.com/ruanchaves/napolab)
- [Hugging Face Dataset](https://huggingface.co/datasets/ruanchaves/napolab)
- Article: ["The Hidden Truth About LLM Performance: Why Your Benchmark Results Might Be Misleading"](https://ruanchaves.medium.com/the-hidden-truth-about-llm-performance-why-your-benchmark-results-might-be-misleading-afd24f40a46c)
### Data Sources:
The benchmark results and model evaluations presented in this leaderboard are compiled from multiple sources:
**1. "Lessons learned from the evaluation of Portuguese language models"** by Ruan Chaves Rodrigues (2023). Available at: [University of Malta OAR@UM Repository](https://www.um.edu.mt/library/oar/handle/123456789/120557)
**2. Open PT LLM Leaderboard** by Eduardo Garcia (2025). Available at: [Hugging Face Spaces](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
**3. "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"** by Corrêa et al. (2024). Available at: [arXiv](https://arxiv.org/abs/2401.16640).
### Thesis Citation:
```bibtex
@mastersthesis{chaves2023lessons,
title={Lessons learned from the evaluation of Portuguese language models},
author={Chaves Rodrigues, Ruan},
year={2023},
school={University of Malta},
url={https://www.um.edu.mt/library/oar/handle/123456789/120557}
}
```
### Napolab Citation:
```bibtex
@software{Chaves_Rodrigues_napolab_2023,
author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
doi = {10.5281/zenodo.7781848},
month = {3},
title = {{Natural Portuguese Language Benchmark (Napolab)}},
url = {https://github.com/ruanchaves/napolab},
version = {1.0.0},
year = {2023}
}
```
""")
    def create_model_performance_scatter(
        selected_datasets: Optional[List[str]] = None,
        show_napolab_thesis: bool = True,
        show_teenytinyllama: bool = True,
        show_portuguese_leaderboard: bool = True,
        show_external_models: bool = True,
        hide_incomplete_models: bool = False,
        min_average_performance: float = 0.0,
        search_query: str = "",
        max_num_parameters: int = 0,
    ) -> go.Figure:
"""Create a scatter plot showing model performance vs number of parameters."""
# Use selected datasets if provided, otherwise use all datasets
if selected_datasets is None:
selected_datasets = list(NAPOLAB_DATASETS.keys())
# Collect data for each model
model_data = {}
# Process existing benchmark results
for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
if dataset_name in selected_datasets:
for model_name, metrics in models.items():
if model_name not in model_data:
# Get actual source from MODEL_METADATA
model_metadata = MODEL_METADATA.get(model_name, {})
actual_source = model_metadata.get('source', 'unknown')
model_data[model_name] = {
'performances': {},
'architecture': model_metadata.get('architecture', 'Unknown'),
'source': actual_source,
'num_parameters': 0
}
# Calculate average performance for this dataset
avg_performance = np.mean(list(metrics.values()))
model_data[model_name]['performances'][dataset_name] = avg_performance
# Process Portuguese leaderboard data
if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
model_name = row['model_name']
if model_name not in model_data:
model_data[model_name] = {
'performances': {},
'architecture': 'Unknown',
'source': 'portuguese_leaderboard',
'num_parameters': row.get('model_num_parameters', 0)
}
# Map Portuguese leaderboard columns to dataset names
column_mapping = {
'ASSIN2 RTE': 'assin2_rte',
'ASSIN2 STS': 'assin2_sts',
'FaQUaD-NLI': 'faquad-nli',
'HateBR': 'hatebr'
}
for display_name, dataset_name in column_mapping.items():
if dataset_name in selected_datasets:
score = row[display_name]
if pd.notna(score) and score > 0:
model_data[model_name]['performances'][dataset_name] = score
# Process external models data
if show_external_models and not EXTERNAL_MODELS_DATA.empty:
for _, row in EXTERNAL_MODELS_DATA.iterrows():
model_name = row['model_name']
if model_name not in model_data:
model_data[model_name] = {
'performances': {},
'architecture': 'Unknown',
'source': 'external_models',
'num_parameters': row.get('model_num_parameters', 0)
}
# Map external models columns to dataset names
column_mapping = {
'ASSIN2 RTE': 'assin2_rte',
'ASSIN2 STS': 'assin2_sts',
'FaQUaD-NLI': 'faquad-nli',
'HateBR': 'hatebr'
}
for display_name, dataset_name in column_mapping.items():
if dataset_name in selected_datasets:
score = row[display_name]
if pd.notna(score) and score > 0:
model_data[model_name]['performances'][dataset_name] = score
# Apply source filtering
filtered_model_data = {}
for model_name, data in model_data.items():
source = data.get('source', 'existing')
# Apply show filters - only show models from sources that are checked
if source == 'napolab_thesis' and not show_napolab_thesis:
continue
if source == 'teenytinyllama_paper' and not show_teenytinyllama:
continue
if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
continue
if source == 'external_models' and not show_external_models:
continue
# Hide models with unknown source (should not happen with proper data)
if source == 'unknown':
continue
# Apply parameter filtering (only for Portuguese leaderboard models)
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
num_parameters = data.get('num_parameters', 0)
if num_parameters > max_num_parameters:
continue
filtered_model_data[model_name] = data
# Apply incomplete model filtering
if hide_incomplete_models and selected_datasets:
final_filtered_data = {}
for model_name, data in filtered_model_data.items():
has_all_scores = True
for dataset_name in selected_datasets:
if data['performances'].get(dataset_name, 0) == 0:
has_all_scores = False
break
if has_all_scores:
final_filtered_data[model_name] = data
filtered_model_data = final_filtered_data
# Apply minimum average performance filtering
if min_average_performance > 0 and selected_datasets:
final_filtered_data = {}
for model_name, data in filtered_model_data.items():
# Calculate average performance for selected datasets
scores = []
for dataset_name in selected_datasets:
score = data['performances'].get(dataset_name, 0)
if score > 0: # Only include non-zero scores
scores.append(score)
if scores:
avg_performance = np.mean(scores)
if avg_performance >= min_average_performance:
final_filtered_data[model_name] = data
filtered_model_data = final_filtered_data
# Apply search query filtering
if search_query:
final_filtered_data = {}
try:
                # Regex pattern matching; re is imported at module level
                pattern = re.compile(search_query, re.IGNORECASE)
for model_name, data in filtered_model_data.items():
if pattern.search(model_name):
final_filtered_data[model_name] = data
except re.error:
# Fallback to simple string matching if regex is invalid
for model_name, data in filtered_model_data.items():
if search_query.lower() in model_name.lower():
final_filtered_data[model_name] = data
filtered_model_data = final_filtered_data
# Prepare data for scatter plot
scatter_data = []
for model_name, data in filtered_model_data.items():
# Calculate average performance for selected datasets
scores = []
for dataset_name in selected_datasets:
score = data['performances'].get(dataset_name, 0)
if score > 0: # Only include non-zero scores
scores.append(score)
if scores:
avg_performance = np.mean(scores)
num_parameters = data.get('num_parameters', 0)
source = data.get('source', 'unknown')
scatter_data.append({
'model_name': model_name,
'avg_performance': avg_performance,
'num_parameters': num_parameters,
'source': source
})
if not scatter_data:
# Create empty figure if no data
fig = go.Figure()
fig.add_annotation(
text="No data available for the selected filters",
xref="paper", yref="paper",
x=0.5, y=0.5, showarrow=False,
font=dict(size=16)
)
fig.update_layout(
title="Model Performance vs Number of Parameters",
xaxis_title="Number of Parameters",
yaxis_title="Average Performance Score",
height=500
)
return fig
# Create scatter plot
df_scatter = pd.DataFrame(scatter_data)
# Create color mapping for sources
color_map = {
'portuguese_leaderboard': '#1f77b4',
'external_models': '#ff7f0e',
'napolab_thesis': '#2ca02c',
'teenytinyllama_paper': '#d62728',
'unknown': '#9467bd'
}
# Create display name mapping for sources
display_name_map = {
'portuguese_leaderboard': 'Open PT LLM Leaderboard',
'external_models': 'Proprietary Models',
'napolab_thesis': 'Napolab Thesis',
'teenytinyllama_paper': 'TeenyTinyLlama Paper',
'unknown': 'Unknown Source'
}
fig = go.Figure()
for source in df_scatter['source'].unique():
source_data = df_scatter[df_scatter['source'] == source]
color = color_map.get(source, '#7f7f7f')
display_name = display_name_map.get(source, source.replace('_', ' ').title())
fig.add_trace(go.Scatter(
x=source_data['num_parameters'],
y=source_data['avg_performance'],
mode='markers',
name=display_name,
marker=dict(
color=color,
size=8,
opacity=0.7
),
text=source_data['model_name'],
hovertemplate=(
"<b>%{text}</b><br>" +
"Average Performance: %{y:.3f}<br>" +
"Number of Parameters: %{x:,}<br>" +
"Source: " + display_name + "<br>" +
"<extra></extra>"
)
))
fig.update_layout(
title="Model Performance vs Number of Parameters",
xaxis_title="Number of Parameters",
yaxis_title="Average Performance Score",
height=500,
showlegend=True,
plot_bgcolor='rgba(255, 255, 255, 0)',
paper_bgcolor='rgba(255, 255, 255, 0)',
legend=dict(
yanchor="top",
y=-0.15,
xanchor="center",
x=0.5,
bgcolor='rgba(255, 255, 255, 0.95)',
bordercolor='rgba(0, 0, 0, 0.2)',
borderwidth=1,
orientation="h"
),
margin=dict(l=50, r=50, t=100, b=100)
)
return fig
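    # Example (illustrative only, assuming the CSV stores raw parameter counts):
    # the scatter plot restricted to leaderboard models with at most 8B parameters.
    #
    #   fig = create_model_performance_scatter(max_num_parameters=8_000_000_000)
    #   fig.show()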
# Event handlers
def update_radar_chart(*args):
# Extract arguments for radar chart
dataset_values = args[:len(analysis_dataset_checkboxes)]
hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
show_external_models = args[len(analysis_dataset_checkboxes) + 5]
search_query = args[len(analysis_dataset_checkboxes) + 6]
max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
# Convert dataset selections to list of selected dataset names
selected_datasets = []
for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
if dataset_values[i]:
selected_datasets.append(dataset_name)
return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
def update_benchmark_table(*args):
# Extract arguments
dataset_values = args[:len(dataset_checkboxes)]
hide_incomplete_models = args[len(dataset_checkboxes)]
min_average_performance = args[len(dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
show_napolab_thesis = args[len(dataset_checkboxes) + 2]
show_teenytinyllama = args[len(dataset_checkboxes) + 3]
show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
show_external_models = args[len(dataset_checkboxes) + 5]
search_query = args[len(dataset_checkboxes) + 6]
max_num_parameters = args[len(dataset_checkboxes) + 7]
# Convert dataset selections to list of selected dataset names
selected_datasets = []
for i, (dataset_name, _) in enumerate(dataset_checkboxes):
if dataset_values[i]:
selected_datasets.append(dataset_name)
df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
return df
def update_scatter_plot(*args):
# Extract arguments for scatter plot
dataset_values = args[:len(analysis_dataset_checkboxes)]
hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal
show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
show_external_models = args[len(analysis_dataset_checkboxes) + 5]
search_query = args[len(analysis_dataset_checkboxes) + 6]
max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
# Convert dataset selections to list of selected dataset names
selected_datasets = []
for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
if dataset_values[i]:
selected_datasets.append(dataset_name)
return create_model_performance_scatter(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
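    # The three update handlers above unpack *args positionally, so the
    # `inputs=` lists wired below must keep exactly this order: dataset
    # checkboxes first, then hide_incomplete, min_average (as a percentage),
    # the four source toggles, the search query, and the parameter cap.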
    # Wire every benchmark-tab control to refresh the table. Each control is
    # both a trigger and an input, so a single list serves both roles.
    benchmark_inputs = [cb for _, cb in dataset_checkboxes] + [
        hide_incomplete_models, min_average_performance, show_napolab_thesis,
        show_teenytinyllama, show_portuguese_leaderboard, show_external_models,
        search_query, max_num_parameters,
    ]
    for control in benchmark_inputs:
        control.change(
            update_benchmark_table,
            inputs=benchmark_inputs,
            outputs=benchmark_table
        )
# Connect export button
export_button.click(
export_csv,
inputs=benchmark_table,
outputs=csv_file
)
# Connect file download to cleanup
csv_file.change(
cleanup_current_csv,
inputs=None,
outputs=None
)
    # Wire every analysis-tab control to refresh both the radar chart and the
    # scatter plot, mirroring the benchmark-tab wiring above.
    analysis_inputs = [cb for _, cb in analysis_dataset_checkboxes] + [
        hide_incomplete_models_analysis, min_average_performance_analysis,
        show_napolab_thesis_analysis, show_teenytinyllama_analysis,
        show_portuguese_leaderboard_analysis, show_external_models_analysis,
        search_query_analysis, max_num_parameters_analysis,
    ]
    for control in analysis_inputs:
        control.change(
            update_radar_chart,
            inputs=analysis_inputs,
            outputs=model_analysis_chart
        )
        control.change(
            update_scatter_plot,
            inputs=analysis_inputs,
            outputs=model_scatter_plot
        )
    # Load the default view on app start. The default dataset selection must
    # match the checkbox defaults above (ASSIN 2 STS, FaQUaD-NLI, HateBR).
    def default_inputs():
        dataset_defaults = [
            NAPOLAB_DATASETS[name].get('name', name) in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
            for name in sorted(NAPOLAB_DATASETS.keys())
        ]
        # hide_incomplete, min_avg (%), four source toggles, search query, max params
        return dataset_defaults + [True, 80, True, True, True, True, "", 0]
    app.load(lambda: update_radar_chart(*default_inputs()), outputs=model_analysis_chart)
    app.load(lambda: update_scatter_plot(*default_inputs()), outputs=model_scatter_plot)
    app.load(lambda: update_benchmark_table(*default_inputs()), outputs=benchmark_table)
if __name__ == "__main__":
app.launch(server_name="0.0.0.0", server_port=7860)