import gradio as gr import pandas as pd import numpy as np import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots import json import os import re from typing import Dict, List, Optional, Tuple # Import data loader from data_loader import data_loader, get_napolab_datasets, get_sample_benchmark_results, get_model_metadata # Load data from YAML file NAPOLAB_DATASETS = get_napolab_datasets() SAMPLE_BENCHMARK_RESULTS = get_sample_benchmark_results() MODEL_METADATA = get_model_metadata() def load_portuguese_leaderboard_data() -> pd.DataFrame: """Load data from the Portuguese leaderboard CSV file.""" try: csv_path = "portuguese_leaderboard.csv" if os.path.exists(csv_path): df = pd.read_csv(csv_path) # Select only the relevant columns relevant_columns = ['model_name', 'model_num_parameters', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive'] df = df[relevant_columns].copy() # Rename columns to match the existing format df = df.rename(columns={ 'assin2_rte': 'ASSIN2 RTE', 'assin2_sts': 'ASSIN2 STS', 'faquad_nli': 'FaQUaD-NLI', 'hatebr_offensive': 'HateBR' }) # Add source information df['source'] = 'portuguese_leaderboard' print(f"Loaded {len(df)} models from Portuguese leaderboard") return df else: print(f"Portuguese leaderboard CSV not found: {csv_path}") return pd.DataFrame() except Exception as e: print(f"Error loading Portuguese leaderboard data: {e}") return pd.DataFrame() def load_external_models_data() -> pd.DataFrame: """Load data from the external models CSV file.""" try: csv_path = "external_models.csv" if os.path.exists(csv_path): df = pd.read_csv(csv_path) # Select only the relevant columns relevant_columns = ['model', 'link', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive'] df = df[relevant_columns].copy() # Rename columns to match the existing format df = df.rename(columns={ 'model': 'model_name', 'assin2_rte': 'ASSIN2 RTE', 'assin2_sts': 'ASSIN2 STS', 'faquad_nli': 'FaQUaD-NLI', 'hatebr_offensive': 'HateBR' }) # Add source information df['source'] = 'external_models' # Add model_num_parameters column with 0 for external models df['model_num_parameters'] = 0 print(f"Loaded {len(df)} external models") return df else: print(f"External models CSV not found: {csv_path}") return pd.DataFrame() except Exception as e: print(f"Error loading external models data: {e}") return pd.DataFrame() # Load Portuguese leaderboard data PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data() # Load external models data EXTERNAL_MODELS_DATA = load_external_models_data() def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> pd.DataFrame: """Create a simplified benchmark table with one column per dataset.""" # Get all dataset names dataset_names = sorted(NAPOLAB_DATASETS.keys()) dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names] # Use selected datasets if provided, otherwise use all datasets if selected_datasets is None: selected_datasets = dataset_names # Collect data for each model model_data = {} # Process existing benchmark results for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items(): for model_name, metrics in models.items(): if model_name not in model_data: model_data[model_name] 
= { 'dataset_scores': {}, 'url': None, 'source': 'existing' } # Calculate average performance for this dataset avg_performance = np.mean(list(metrics.values())) model_data[model_name]['dataset_scores'][dataset_name] = avg_performance # Process Portuguese leaderboard data if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty: for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows(): model_name = row['model_name'] if model_name not in model_data: model_data[model_name] = { 'dataset_scores': {}, 'url': None, 'source': 'portuguese_leaderboard', 'num_parameters': row.get('model_num_parameters', 0) } # Map Portuguese leaderboard columns to dataset names column_mapping = { 'ASSIN2 RTE': 'assin2_rte', 'ASSIN2 STS': 'assin2_sts', 'FaQUaD-NLI': 'faquad-nli', 'HateBR': 'hatebr' } for display_name, dataset_name in column_mapping.items(): if dataset_name in NAPOLAB_DATASETS: score = row[display_name] if pd.notna(score) and score > 0: model_data[model_name]['dataset_scores'][dataset_name] = score # Process external models data if show_external_models and not EXTERNAL_MODELS_DATA.empty: for _, row in EXTERNAL_MODELS_DATA.iterrows(): model_name = row['model_name'] if model_name not in model_data: model_data[model_name] = { 'dataset_scores': {}, 'url': row.get('link', ''), 'source': 'external_models', 'num_parameters': row.get('model_num_parameters', 0) } # Map external models columns to dataset names column_mapping = { 'ASSIN2 RTE': 'assin2_rte', 'ASSIN2 STS': 'assin2_sts', 'FaQUaD-NLI': 'faquad-nli', 'HateBR': 'hatebr' } for display_name, dataset_name in column_mapping.items(): if dataset_name in NAPOLAB_DATASETS: score = row[display_name] if pd.notna(score) and score > 0: model_data[model_name]['dataset_scores'][dataset_name] = score # Get model URLs and source information for existing models additional_models = data_loader.get_additional_models() for model_name in model_data.keys(): if model_data[model_name]['source'] == 'existing': # Get URL for arch_models in additional_models.values(): if model_name in arch_models: model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '') break # Get source information model_metadata = MODEL_METADATA.get(model_name, {}) source = model_metadata.get('source', 'unknown') model_data[model_name]['source'] = source # Add num_parameters for existing models (set to 0 as they don't have this info) model_data[model_name]['num_parameters'] = 0 # Create table data table_data = [] for model_name, data in model_data.items(): # Apply source filtering source = data['source'] # Apply show filters - only show models from sources that are checked if source == 'napolab_thesis' and not show_napolab_thesis: continue if source == 'teenytinyllama_paper' and not show_teenytinyllama: continue if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard: continue if source == 'external_models' and not show_external_models: continue # Hide models with unknown source (should not happen with proper data) if source == 'unknown': continue # Apply parameter filtering (only for Portuguese leaderboard models) if max_num_parameters > 0 and source == 'portuguese_leaderboard': num_parameters = data.get('num_parameters', 0) if num_parameters > max_num_parameters: continue # Create clickable link for model name if data['url']: model_display = f"[{model_name}]({data['url']})" elif source == 'portuguese_leaderboard' and '/' in model_name: # Create Hugging Face link for Portuguese leaderboard models with slashes huggingface_url = 
f"https://huggingface.co/{model_name}" model_display = f"[{model_name}]({huggingface_url})" else: model_display = model_name # Create row with dataset scores row_data = {'Model': model_display} # Calculate average only over selected datasets selected_scores = [] for dataset_name in selected_datasets: score = data['dataset_scores'].get(dataset_name, 0) if score > 0: # Only include non-zero scores in average selected_scores.append(score) overall_avg = np.mean(selected_scores) if selected_scores else 0 row_data['Average'] = round(overall_avg, 4) # Add scores for each dataset (only selected ones) for dataset_name in dataset_names: score = data['dataset_scores'].get(dataset_name, 0) display_name = dataset_display_names[dataset_names.index(dataset_name)] # Only add columns for selected datasets if dataset_name in selected_datasets: row_data[display_name] = round(score, 4) table_data.append(row_data) df = pd.DataFrame(table_data) # Filter to show only models that have scores for at least one selected dataset if selected_datasets and not df.empty: # Get display names for selected datasets selected_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in selected_datasets] # Filter models based on selection criteria models_to_keep = [] for _, row in df.iterrows(): has_score = False has_all_scores = True # Only check the datasets that are actually selected for display for dataset_name in selected_datasets: display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name) if display_name in df.columns: score = row[display_name] if score > 0: has_score = True else: has_all_scores = False # Keep model if it has at least one score if has_score: # If hide_incomplete_models is True, only keep models with all scores in selected datasets if not hide_incomplete_models or has_all_scores: models_to_keep.append(row['Model']) # Filter dataframe to only include selected models if models_to_keep: df = df[df['Model'].isin(models_to_keep)] else: # If no models to keep, create empty DataFrame with proper structure # Create columns list first columns = ['Model'] for dataset_name in dataset_names: display_name = dataset_display_names[dataset_names.index(dataset_name)] if dataset_name in selected_datasets: columns.append(display_name) columns.append('Average') # Create empty DataFrame with correct columns df = pd.DataFrame(columns=columns) # Filter by minimum average performance if min_average_performance > 0 and not df.empty: df = df[df['Average'] >= min_average_performance] # Filter by search query if search_query and not df.empty: # Extract model names from markdown links for searching df_filtered = df.copy() df_filtered['model_name_clean'] = df_filtered['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True) try: # Use regex pattern matching df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False, regex=True)] except re.error: # Fallback to simple string matching if regex is invalid df_filtered = df_filtered[df_filtered['model_name_clean'].str.contains(search_query, case=False, na=False)] df = df_filtered.drop('model_name_clean', axis=1) # Sort by Average (descending) if not df.empty: df = df.sort_values('Average', ascending=False) # Add rank column with medal emojis for top 3 and color-coded emojis for others if not df.empty: df = df.reset_index(drop=True) df.index = df.index + 1 # Start ranking from 1 # Create rank column with medal emojis and color-coded emojis rank_column = [] total_models = len(df) for rank in df.index: if rank == 1: 
                rank_column.append("🥇 1")
            elif rank == 2:
                rank_column.append("🥈 2")
            elif rank == 3:
                rank_column.append("🥉 3")
            else:
                # Color-code based on position relative to total
                position_ratio = rank / total_models
                if position_ratio <= 0.33:  # Top third
                    rank_column.append("🟢 " + str(rank))
                elif position_ratio <= 0.67:  # Middle third
                    rank_column.append("🟡 " + str(rank))
                else:  # Bottom third
                    rank_column.append("🔴 " + str(rank))

        df.insert(0, 'Rank', rank_column)

    return df


# Global variable to track the current CSV file
current_csv_file = None


def export_csv(df: pd.DataFrame):
    """Export the benchmark table to CSV."""
    global current_csv_file

    print(f"Export function called with dataframe shape: {df.shape}")

    if df.empty:
        print("Dataframe is empty, returning None")
        return None

    # Clean up previous file if it exists
    if current_csv_file:
        try:
            import os
            if os.path.exists(current_csv_file):
                os.remove(current_csv_file)
                print(f"Deleted previous CSV file: {current_csv_file}")
        except Exception as e:
            print(f"Error deleting previous file {current_csv_file}: {e}")

    # Clean the dataframe for CSV export
    df_clean = df.copy()

    # Remove markdown formatting from model names for cleaner CSV
    df_clean['Model'] = df_clean['Model'].str.replace(r'\[([^\]]+)\]\([^)]+\)', r'\1', regex=True)

    # Create filename with timestamp
    from datetime import datetime
    import tempfile
    import os

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"napolab_benchmark_results_{timestamp}.csv"

    # Create file in current directory (simpler approach)
    file_path = filename
    print(f"Creating CSV file at: {file_path}")

    # Save to CSV file
    df_clean.to_csv(file_path, index=False)
    print(f"CSV file created successfully. File exists: {os.path.exists(file_path)}")

    # Update current file tracking
    current_csv_file = file_path

    return file_path


def cleanup_current_csv():
    """Clean up the current CSV file after download."""
    global current_csv_file
    import os
    if current_csv_file and os.path.exists(current_csv_file):
        try:
            os.remove(current_csv_file)
            print(f"Deleted CSV file after download: {current_csv_file}")
            current_csv_file = None
        except Exception as e:
            print(f"Error deleting file {current_csv_file}: {e}")


def create_model_performance_radar(selected_datasets: List[str] = None,
                                   show_napolab_thesis: bool = True,
                                   show_teenytinyllama: bool = True,
                                   show_portuguese_leaderboard: bool = True,
                                   show_external_models: bool = True,
                                   hide_incomplete_models: bool = False,
                                   min_average_performance: float = 0.0,
                                   search_query: str = "",
                                   max_num_parameters: int = 0) -> go.Figure:
    """Create a radar chart showing model performance across all datasets."""
    # Use selected datasets if provided, otherwise use all datasets
    if selected_datasets is None:
        selected_datasets = list(NAPOLAB_DATASETS.keys())

    # Get dataset names for the radar axes (only selected ones)
    dataset_names = selected_datasets
    dataset_display_names = [NAPOLAB_DATASETS[name].get('name', name) for name in dataset_names]

    # Collect data for each model
    model_data = {}

    # Process existing benchmark results
    for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
        if dataset_name in selected_datasets:
            for model_name, metrics in models.items():
                if model_name not in model_data:
                    model_data[model_name] = {
                        'performances': {},
                        'architecture': MODEL_METADATA.get(model_name, {}).get('architecture', 'Unknown'),
                        'source': 'existing'
                    }
                # Calculate average performance for this dataset
                avg_performance = np.mean(list(metrics.values()))
                model_data[model_name]['performances'][dataset_name] = avg_performance

    # Process Portuguese leaderboard data
    if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
        for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
            model_name = row['model_name']
            if model_name not in model_data:
                model_data[model_name] = {
                    'performances': {},
                    'architecture': 'Unknown',
                    'source': 'portuguese_leaderboard',
                    'num_parameters': row.get('model_num_parameters', 0)
                }
            # Map Portuguese leaderboard columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQUaD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }
            for display_name, dataset_name in column_mapping.items():
                if dataset_name in selected_datasets:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['performances'][dataset_name] = score

    # Process external models data
    if show_external_models and not EXTERNAL_MODELS_DATA.empty:
        for _, row in EXTERNAL_MODELS_DATA.iterrows():
            model_name = row['model_name']
            if model_name not in model_data:
                model_data[model_name] = {
                    'performances': {},
                    'architecture': 'Unknown',
                    'source': 'external_models',
                    'num_parameters': row.get('model_num_parameters', 0)
                }
            # Map external models columns to dataset names
            column_mapping = {
                'ASSIN2 RTE': 'assin2_rte',
                'ASSIN2 STS': 'assin2_sts',
                'FaQUaD-NLI': 'faquad-nli',
                'HateBR': 'hatebr'
            }
            for display_name, dataset_name in column_mapping.items():
                if dataset_name in selected_datasets:
                    score = row[display_name]
                    if pd.notna(score) and score > 0:
                        model_data[model_name]['performances'][dataset_name] = score

    # Get model URLs and source information for existing models
    additional_models = data_loader.get_additional_models()
    for model_name in model_data.keys():
        if model_data[model_name]['source'] == 'existing':
            # Get URL
            for arch_models in additional_models.values():
                if model_name in arch_models:
                    model_data[model_name]['url'] = arch_models[model_name].get('huggingface_url', '')
                    break
            # Get source information
            model_metadata = MODEL_METADATA.get(model_name, {})
            source = model_metadata.get('source', 'unknown')
            model_data[model_name]['source'] = source
            # Add num_parameters for existing models (set to 0 as they don't have this info)
            model_data[model_name]['num_parameters'] = 0

    # Apply source filtering
    filtered_model_data = {}
    for model_name, data in model_data.items():
        source = data.get('source', 'existing')

        # Apply show filters - only show models from sources that are checked
        if source == 'napolab_thesis' and not show_napolab_thesis:
            continue
        if source == 'teenytinyllama_paper' and not show_teenytinyllama:
            continue
        if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
            continue
        if source == 'external_models' and not show_external_models:
            continue
        # Hide models with unknown source (should not happen with proper data)
        if source == 'unknown':
            continue

        # Apply parameter filtering (only for Portuguese leaderboard models)
        if max_num_parameters > 0 and source == 'portuguese_leaderboard':
            num_parameters = data.get('num_parameters', 0)
            if num_parameters > max_num_parameters:
                continue

        filtered_model_data[model_name] = data

    # Apply incomplete model filtering
    if hide_incomplete_models and selected_datasets:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            has_all_scores = True
            for dataset_name in selected_datasets:
                if data['performances'].get(dataset_name, 0) == 0:
                    has_all_scores = False
                    break
            if has_all_scores:
                final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Apply minimum average performance filtering
    if min_average_performance > 0 and selected_datasets:
        final_filtered_data = {}
        for model_name, data in filtered_model_data.items():
            # Calculate average performance for selected datasets
            scores = []
            for dataset_name in selected_datasets:
                score = data['performances'].get(dataset_name, 0)
                if score > 0:  # Only include non-zero scores
                    scores.append(score)
            if scores:
                avg_performance = np.mean(scores)
                if avg_performance >= min_average_performance:
                    final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Apply search query filtering
    if search_query:
        final_filtered_data = {}
        try:
            # Use regex pattern matching
            import re
            pattern = re.compile(search_query, re.IGNORECASE)
            for model_name, data in filtered_model_data.items():
                if pattern.search(model_name):
                    final_filtered_data[model_name] = data
        except re.error:
            # Fallback to simple string matching if regex is invalid
            for model_name, data in filtered_model_data.items():
                if search_query.lower() in model_name.lower():
                    final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Sort models by average performance (descending)
    model_performances = []
    for model_name, data in filtered_model_data.items():
        # Calculate average performance for selected datasets
        scores = []
        for dataset_name in selected_datasets:
            score = data['performances'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores
                scores.append(score)
        avg_performance = np.mean(scores) if scores else 0
        model_performances.append((model_name, data, avg_performance))

    # Sort by average performance (descending)
    model_performances.sort(key=lambda x: x[2], reverse=True)

    # Calculate dynamic range based on actual data
    all_performance_values = []
    for model_name, data, avg_performance in model_performances:
        for dataset_name in dataset_names:
            score = data['performances'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores
                all_performance_values.append(score)

    # Set dynamic range with some padding
    if all_performance_values:
        min_score = min(all_performance_values)
        max_score = max(all_performance_values)
        # Add 5% padding below minimum and ensure minimum is not below 0.5
        range_min = max(0.5, min_score - (max_score - min_score) * 0.05)
        range_max = 1.0
    else:
        # Fallback to default range if no data
        range_min = 0.6
        range_max = 1.0

    # Create radar chart
    fig = go.Figure()

    # Generate a more distinguishable color palette
    num_models = len(model_performances)

    # Create a list of line styles for better differentiation
    line_styles = ['solid', 'dash', 'dot', 'dashdot', 'longdash', 'longdashdot']

    # Use highly contrasting colors for better differentiation
    base_colors = [
        '#1f77b4',  # Blue
        '#ff7f0e',  # Orange
        '#2ca02c',  # Green
        '#d62728',  # Red
        '#9467bd',  # Purple
        '#8c564b',  # Brown
        '#e377c2',  # Pink
        '#7f7f7f',  # Gray
        '#bcbd22',  # Olive
        '#17becf',  # Cyan
        '#ff9896',  # Light Red
        '#98df8a',  # Light Green
        '#ffbb78',  # Light Orange
        '#aec7e8',  # Light Blue
        '#c5b0d5',  # Light Purple
    ]

    # Ensure we have enough colors
    while len(base_colors) < num_models:
        base_colors.extend(base_colors)
    colors = base_colors[:num_models]

    for i, (model_name, data, avg_performance) in enumerate(model_performances):
        # Get performance values for all datasets (fill with 0 if missing)
        performance_values = []
        for dataset_name in dataset_names:
            performance_values.append(data['performances'].get(dataset_name, 0))

        # Close the polygon by adding the first value at the end
        if performance_values:
            performance_values.append(performance_values[0])

        # Assign color and line style based on model index for better differentiation
        color = colors[i % len(colors)]
        line_style = line_styles[i % len(line_styles)]

        # Show first two models by default, hide the rest
        visible = True if i < 2 else 'legendonly'

        # Create theta values that close the polygon
        theta_values = dataset_display_names + [dataset_display_names[0]] if dataset_display_names else []

        fig.add_trace(go.Scatterpolar(
            r=performance_values,
            theta=theta_values,
            fill=None,
            name=model_name,
            line_color=color,
            line_dash=line_style,
            line_width=3,
            opacity=0.8,
            visible=visible,
            hovertemplate=(
                "%{fullData.name}<br>" +
                "Dataset: %{theta}<br>" +
                "Performance: %{r:.3f}<br>" +
                "Architecture: " + data['architecture'] + "<br>" +
" + "" ) )) # Update layout fig.update_layout( title="Model Performance Radar Chart", polar=dict( radialaxis=dict( visible=True, range=[range_min, range_max], gridcolor='rgba(0, 0, 0, 0.2)', linecolor='rgba(0, 0, 0, 0.5)', tickcolor='rgba(0, 0, 0, 0.7)', tickfont=dict(color='rgba(0, 0, 0, 0.8)') ), angularaxis=dict( tickmode='array', tickvals=list(range(len(dataset_display_names))), ticktext=dataset_display_names, gridcolor='rgba(0, 0, 0, 0.2)', linecolor='rgba(0, 0, 0, 0.5)', tickcolor='rgba(0, 0, 0, 0.7)', tickfont=dict(color='rgba(0, 0, 0, 0.8)') ), bgcolor='rgba(255, 255, 255, 0)' ), height=700, showlegend=True, plot_bgcolor='rgba(255, 255, 255, 0)', paper_bgcolor='rgba(255, 255, 255, 0)', legend=dict( yanchor="top", y=-0.15, xanchor="center", x=0.5, bgcolor='rgba(255, 255, 255, 0.95)', bordercolor='rgba(0, 0, 0, 0.2)', borderwidth=1, orientation="h", font=dict(color='rgba(0, 0, 0, 0.8)') ), margin=dict(l=50, r=50, t=100, b=100), font=dict(color='rgba(0, 0, 0, 0.8)') ) return fig # Gradio Interface with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app: gr.Markdown(""" # 🌎 Napolab Leaderboard Stay up to date with the latest advancements in Portuguese language models and their performance across carefully curated Portuguese language tasks. [⭐ Star us on GitHub](https://github.com/ruanchaves/napolab) """) with gr.Tabs(): # Benchmark Results Tab with gr.Tab("πŸ† Benchmark Results"): gr.Markdown("### Model Performance Benchmarks") with gr.Accordion("Select Datasets to Include: (Click to expand)", open=False): with gr.Row(): # Create checkboxes for each dataset dataset_checkboxes = [] for dataset_name in sorted(NAPOLAB_DATASETS.keys()): display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name) # Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] checkbox = gr.Checkbox( label=display_name, value=default_value ) dataset_checkboxes.append((dataset_name, checkbox)) with gr.Accordion("Filter by Score: (Click to expand)", open=False): with gr.Row(): hide_incomplete_models = gr.Checkbox( label="Hide models with zero scores in selected datasets", value=True ) min_average_performance = gr.Slider( minimum=0, maximum=100, value=80, step=1, label="Minimum Average Performance (%)" ) with gr.Accordion("Filter by Data Source: (Click to expand)", open=False): with gr.Row(): show_napolab_thesis = gr.Checkbox( label="Napolab Thesis models", value=True ) show_teenytinyllama = gr.Checkbox( label="TeenyTinyLlama models", value=True ) show_portuguese_leaderboard = gr.Checkbox( label="Open Portuguese LLM Leaderboard models (open-source)", value=True ) show_external_models = gr.Checkbox( label="Open Portuguese LLM Leaderboard models (proprietary)", value=True ) # Calculate max parameters for slider max_params = 0 if not PORTUGUESE_LEADERBOARD_DATA.empty: max_params = int(PORTUGUESE_LEADERBOARD_DATA['model_num_parameters'].max()) with gr.Accordion("Filter by Model Size: (Click to expand)", open=False): with gr.Row(): max_num_parameters = gr.Slider( minimum=0, maximum=max_params, value=0, step=1, label="Maximum Number of Parameters", info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect." ) # Search bar for filtering models search_query = gr.Textbox( label="Search models by name (supports regex)", placeholder="Enter model name or regex pattern to filter...", value="", info="Supports regular expressions. 
Examples: 'bert.*large', 'gemini|gpt', 'mdeberta.*', '^bert'" ) benchmark_table = gr.DataFrame( label="Model Performance Benchmarks", wrap=[True, False, False, False, False, False, False, False, False, False], interactive=False, datatype=["str", "markdown", "number", "number", "number", "number", "number", "number", "number", "number"], column_widths=["80px", "200px", "100px", "120px", "120px", "120px", "120px", "120px", "120px", "120px"] ) gr.Markdown("*πŸ₯‡πŸ₯ˆπŸ₯‰ = Top 3 | 🟒 = Top 33% | 🟑 = Middle 33% | πŸ”΄ = Bottom 33%*") # Export to CSV button and file component export_button = gr.Button("πŸ“₯ Export to CSV", variant="secondary") csv_file = gr.File(label="Download CSV", interactive=False, visible=True) # Model Analysis Tab with gr.Tab("πŸ“ˆ Model Analysis"): gr.Markdown("### Model Performance Radar Chart") # Dataset Selection Controls with gr.Accordion("Select Datasets to Display: (Click to expand)", open=False): with gr.Row(): # Create checkboxes for each dataset analysis_dataset_checkboxes = [] for dataset_name in sorted(NAPOLAB_DATASETS.keys()): display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name) # Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] checkbox = gr.Checkbox( label=display_name, value=default_value ) analysis_dataset_checkboxes.append((dataset_name, checkbox)) # Filter Controls with gr.Accordion("Filter by Score: (Click to expand)", open=False): with gr.Row(): hide_incomplete_models_analysis = gr.Checkbox( label="Hide models with zero scores in selected datasets", value=True ) min_average_performance_analysis = gr.Slider( minimum=0, maximum=100, value=80, step=1, label="Minimum Average Performance (%)" ) with gr.Accordion("Filter by Data Source: (Click to expand)", open=False): with gr.Row(): show_napolab_thesis_analysis = gr.Checkbox( label="Napolab Thesis models", value=True ) show_teenytinyllama_analysis = gr.Checkbox( label="TeenyTinyLlama models", value=True ) show_portuguese_leaderboard_analysis = gr.Checkbox( label="Open Portuguese LLM Leaderboard models (open-source)", value=True ) show_external_models_analysis = gr.Checkbox( label="Open Portuguese LLM Leaderboard models (proprietary)", value=True ) # Parameter slider for Model Analysis tab with gr.Accordion("Filter by Model Size: (Click to expand)", open=False): with gr.Row(): max_num_parameters_analysis = gr.Slider( minimum=0, maximum=max_params, value=0, step=1, label="Maximum Number of Parameters", info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect." ) # Search bar for filtering models in radar chart search_query_analysis = gr.Textbox( label="Search models by name (supports regex)", placeholder="Enter model name or regex pattern to filter...", value="", info="Supports regular expressions. Examples: 'bert.*large', 'gemini|gpt', 'mdeberta.*', '^bert'" ) model_analysis_chart = gr.Plot(label="Model Performance Radar Chart") # Add scatter plot below radar chart model_scatter_plot = gr.Plot(label="Model Performance vs Number of Parameters") gr.Markdown(""" **How to interact with the chart:** - **Click on legend items** to show/hide specific models. - **Double-click on a legend item** to isolate that model (hide all others). - **Double-click again** to show all models. Models in the legend are sorted in descending order based on their average performance across your chosen datasets. 
""") # About Tab with gr.Tab("ℹ️ About"): gr.Markdown(""" ## About Napolab **Natural Portuguese Language Benchmark (Napolab)** is a comprehensive collection of Portuguese datasets designed for evaluating Large Language Models. - [GitHub repository](https://github.com/ruanchaves/napolab) - [Hugging Face Dataset](https://huggingface.co/datasets/ruanchaves/napolab) - Article: ["The Hidden Truth About LLM Performance: Why Your Benchmark Results Might Be Misleading"](https://ruanchaves.medium.com/the-hidden-truth-about-llm-performance-why-your-benchmark-results-might-be-misleading-afd24f40a46c) ### Data Sources: The benchmark results and model evaluations presented in this leaderboard are compiled from multiple sources: **1. "Lessons learned from the evaluation of Portuguese language models"** by Ruan Chaves Rodrigues (2023). Available at: [University of Malta OAR@UM Repository](https://www.um.edu.mt/library/oar/handle/123456789/120557) **2. Open PT LLM Leaderboard** by Eduardo Garcia (2025). Available at: [Hugging Face Spaces](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard). **3. "TeenyTinyLlama: Open-source tiny language models trained in Brazilian Portuguese"** by CorrΓͺa et al. (2024). Available at: [arXiv](https://arxiv.org/abs/2401.16640). ### Thesis Citation: ```bibtex @mastersthesis{chaves2023lessons, title={Lessons learned from the evaluation of Portuguese language models}, author={Chaves Rodrigues, Ruan}, year={2023}, school={University of Malta}, url={https://www.um.edu.mt/library/oar/handle/123456789/120557} } ``` ### Napolab Citation: ```bibtex @software{Chaves_Rodrigues_napolab_2023, author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo}, doi = {10.5281/zenodo.7781848}, month = {3}, title = {{Natural Portuguese Language Benchmark (Napolab)}}, url = {https://github.com/ruanchaves/napolab}, version = {1.0.0}, year = {2023} } ``` """) def create_model_performance_scatter(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure: """Create a scatter plot showing model performance vs number of parameters.""" # Use selected datasets if provided, otherwise use all datasets if selected_datasets is None: selected_datasets = list(NAPOLAB_DATASETS.keys()) # Collect data for each model model_data = {} # Process existing benchmark results for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items(): if dataset_name in selected_datasets: for model_name, metrics in models.items(): if model_name not in model_data: # Get actual source from MODEL_METADATA model_metadata = MODEL_METADATA.get(model_name, {}) actual_source = model_metadata.get('source', 'unknown') model_data[model_name] = { 'performances': {}, 'architecture': model_metadata.get('architecture', 'Unknown'), 'source': actual_source, 'num_parameters': 0 } # Calculate average performance for this dataset avg_performance = np.mean(list(metrics.values())) model_data[model_name]['performances'][dataset_name] = avg_performance # Process Portuguese leaderboard data if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty: for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows(): model_name = row['model_name'] if model_name not in model_data: model_data[model_name] = { 'performances': {}, 'architecture': 'Unknown', 'source': 
'portuguese_leaderboard', 'num_parameters': row.get('model_num_parameters', 0) } # Map Portuguese leaderboard columns to dataset names column_mapping = { 'ASSIN2 RTE': 'assin2_rte', 'ASSIN2 STS': 'assin2_sts', 'FaQUaD-NLI': 'faquad-nli', 'HateBR': 'hatebr' } for display_name, dataset_name in column_mapping.items(): if dataset_name in selected_datasets: score = row[display_name] if pd.notna(score) and score > 0: model_data[model_name]['performances'][dataset_name] = score # Process external models data if show_external_models and not EXTERNAL_MODELS_DATA.empty: for _, row in EXTERNAL_MODELS_DATA.iterrows(): model_name = row['model_name'] if model_name not in model_data: model_data[model_name] = { 'performances': {}, 'architecture': 'Unknown', 'source': 'external_models', 'num_parameters': row.get('model_num_parameters', 0) } # Map external models columns to dataset names column_mapping = { 'ASSIN2 RTE': 'assin2_rte', 'ASSIN2 STS': 'assin2_sts', 'FaQUaD-NLI': 'faquad-nli', 'HateBR': 'hatebr' } for display_name, dataset_name in column_mapping.items(): if dataset_name in selected_datasets: score = row[display_name] if pd.notna(score) and score > 0: model_data[model_name]['performances'][dataset_name] = score # Apply source filtering filtered_model_data = {} for model_name, data in model_data.items(): source = data.get('source', 'existing') # Apply show filters - only show models from sources that are checked if source == 'napolab_thesis' and not show_napolab_thesis: continue if source == 'teenytinyllama_paper' and not show_teenytinyllama: continue if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard: continue if source == 'external_models' and not show_external_models: continue # Hide models with unknown source (should not happen with proper data) if source == 'unknown': continue # Apply parameter filtering (only for Portuguese leaderboard models) if max_num_parameters > 0 and source == 'portuguese_leaderboard': num_parameters = data.get('num_parameters', 0) if num_parameters > max_num_parameters: continue filtered_model_data[model_name] = data # Apply incomplete model filtering if hide_incomplete_models and selected_datasets: final_filtered_data = {} for model_name, data in filtered_model_data.items(): has_all_scores = True for dataset_name in selected_datasets: if data['performances'].get(dataset_name, 0) == 0: has_all_scores = False break if has_all_scores: final_filtered_data[model_name] = data filtered_model_data = final_filtered_data # Apply minimum average performance filtering if min_average_performance > 0 and selected_datasets: final_filtered_data = {} for model_name, data in filtered_model_data.items(): # Calculate average performance for selected datasets scores = [] for dataset_name in selected_datasets: score = data['performances'].get(dataset_name, 0) if score > 0: # Only include non-zero scores scores.append(score) if scores: avg_performance = np.mean(scores) if avg_performance >= min_average_performance: final_filtered_data[model_name] = data filtered_model_data = final_filtered_data # Apply search query filtering if search_query: final_filtered_data = {} try: # Use regex pattern matching import re pattern = re.compile(search_query, re.IGNORECASE) for model_name, data in filtered_model_data.items(): if pattern.search(model_name): final_filtered_data[model_name] = data except re.error: # Fallback to simple string matching if regex is invalid for model_name, data in filtered_model_data.items(): if search_query.lower() in model_name.lower(): 
                    final_filtered_data[model_name] = data
        filtered_model_data = final_filtered_data

    # Prepare data for scatter plot
    scatter_data = []
    for model_name, data in filtered_model_data.items():
        # Calculate average performance for selected datasets
        scores = []
        for dataset_name in selected_datasets:
            score = data['performances'].get(dataset_name, 0)
            if score > 0:  # Only include non-zero scores
                scores.append(score)

        if scores:
            avg_performance = np.mean(scores)
            num_parameters = data.get('num_parameters', 0)
            source = data.get('source', 'unknown')

            scatter_data.append({
                'model_name': model_name,
                'avg_performance': avg_performance,
                'num_parameters': num_parameters,
                'source': source
            })

    if not scatter_data:
        # Create empty figure if no data
        fig = go.Figure()
        fig.add_annotation(
            text="No data available for the selected filters",
            xref="paper", yref="paper",
            x=0.5, y=0.5,
            showarrow=False,
            font=dict(size=16)
        )
        fig.update_layout(
            title="Model Performance vs Number of Parameters",
            xaxis_title="Number of Parameters",
            yaxis_title="Average Performance Score",
            height=500
        )
        return fig

    # Create scatter plot
    df_scatter = pd.DataFrame(scatter_data)

    # Create color mapping for sources
    color_map = {
        'portuguese_leaderboard': '#1f77b4',
        'external_models': '#ff7f0e',
        'napolab_thesis': '#2ca02c',
        'teenytinyllama_paper': '#d62728',
        'unknown': '#9467bd'
    }

    # Create display name mapping for sources
    display_name_map = {
        'portuguese_leaderboard': 'Open PT LLM Leaderboard',
        'external_models': 'Proprietary Models',
        'napolab_thesis': 'Napolab Thesis',
        'teenytinyllama_paper': 'TeenyTinyLlama Paper',
        'unknown': 'Unknown Source'
    }

    fig = go.Figure()

    for source in df_scatter['source'].unique():
        source_data = df_scatter[df_scatter['source'] == source]
        color = color_map.get(source, '#7f7f7f')
        display_name = display_name_map.get(source, source.replace('_', ' ').title())

        fig.add_trace(go.Scatter(
            x=source_data['num_parameters'],
            y=source_data['avg_performance'],
            mode='markers',
            name=display_name,
            marker=dict(
                color=color,
                size=8,
                opacity=0.7
            ),
            text=source_data['model_name'],
            hovertemplate=(
                "%{text}<br>" +
                "Average Performance: %{y:.3f}<br>" +
                "Number of Parameters: %{x:,}<br>" +
                "Source: " + display_name + "<br>" +
" + "" ) )) fig.update_layout( title="Model Performance vs Number of Parameters", xaxis_title="Number of Parameters", yaxis_title="Average Performance Score", height=500, showlegend=True, plot_bgcolor='rgba(255, 255, 255, 0)', paper_bgcolor='rgba(255, 255, 255, 0)', legend=dict( yanchor="top", y=-0.15, xanchor="center", x=0.5, bgcolor='rgba(255, 255, 255, 0.95)', bordercolor='rgba(0, 0, 0, 0.2)', borderwidth=1, orientation="h" ), margin=dict(l=50, r=50, t=100, b=100) ) return fig # Event handlers def update_radar_chart(*args): # Extract arguments for radar chart dataset_values = args[:len(analysis_dataset_checkboxes)] hide_incomplete_models = args[len(analysis_dataset_checkboxes)] min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2] show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3] show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4] show_external_models = args[len(analysis_dataset_checkboxes) + 5] search_query = args[len(analysis_dataset_checkboxes) + 6] max_num_parameters = args[len(analysis_dataset_checkboxes) + 7] # Convert dataset selections to list of selected dataset names selected_datasets = [] for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes): if dataset_values[i]: selected_datasets.append(dataset_name) return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters) def update_benchmark_table(*args): # Extract arguments dataset_values = args[:len(dataset_checkboxes)] hide_incomplete_models = args[len(dataset_checkboxes)] min_average_performance = args[len(dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal show_napolab_thesis = args[len(dataset_checkboxes) + 2] show_teenytinyllama = args[len(dataset_checkboxes) + 3] show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4] show_external_models = args[len(dataset_checkboxes) + 5] search_query = args[len(dataset_checkboxes) + 6] max_num_parameters = args[len(dataset_checkboxes) + 7] # Convert dataset selections to list of selected dataset names selected_datasets = [] for i, (dataset_name, _) in enumerate(dataset_checkboxes): if dataset_values[i]: selected_datasets.append(dataset_name) df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters) return df def update_scatter_plot(*args): # Extract arguments for scatter plot dataset_values = args[:len(analysis_dataset_checkboxes)] hide_incomplete_models = args[len(analysis_dataset_checkboxes)] min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0 # Convert percentage to decimal show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2] show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3] show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4] show_external_models = args[len(analysis_dataset_checkboxes) + 5] search_query = args[len(analysis_dataset_checkboxes) + 6] max_num_parameters = args[len(analysis_dataset_checkboxes) + 7] # Convert dataset selections to list of selected dataset names selected_datasets = [] for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes): if dataset_values[i]: 
selected_datasets.append(dataset_name) return create_model_performance_scatter(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters) # Connect dataset checkboxes to update table for dataset_name, checkbox in dataset_checkboxes: checkbox.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) hide_incomplete_models.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) min_average_performance.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) show_napolab_thesis.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) show_teenytinyllama.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) show_portuguese_leaderboard.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) show_external_models.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) # Connect search query to update table search_query.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) # Connect max_num_parameters to update table max_num_parameters.change( update_benchmark_table, inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters], outputs=benchmark_table ) # Connect export button export_button.click( export_csv, inputs=benchmark_table, outputs=csv_file ) # Connect file download to cleanup csv_file.change( cleanup_current_csv, inputs=None, outputs=None ) # Connect analysis chart events # Connect dataset checkboxes to update radar chart for dataset_name, checkbox in analysis_dataset_checkboxes: checkbox.change( update_radar_chart, 
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_analysis_chart ) hide_incomplete_models_analysis.change( update_radar_chart, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_analysis_chart ) min_average_performance_analysis.change( update_radar_chart, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_analysis_chart ) show_napolab_thesis_analysis.change( update_radar_chart, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_analysis_chart ) show_teenytinyllama_analysis.change( update_radar_chart, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_analysis_chart ) show_portuguese_leaderboard_analysis.change( update_radar_chart, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_analysis_chart ) show_external_models_analysis.change( update_radar_chart, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_analysis_chart ) # Connect search query to update radar chart search_query_analysis.change( update_radar_chart, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_analysis_chart ) # Connect max_num_parameters_analysis to update radar chart max_num_parameters_analysis.change( update_radar_chart, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, 
max_num_parameters_analysis], outputs=model_analysis_chart ) # Connect all analysis controls to update scatter plot for dataset_name, checkbox in analysis_dataset_checkboxes: checkbox.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) hide_incomplete_models_analysis.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) min_average_performance_analysis.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) show_napolab_thesis_analysis.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) show_teenytinyllama_analysis.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) show_portuguese_leaderboard_analysis.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) show_external_models_analysis.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) search_query_analysis.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) max_num_parameters_analysis.change( update_scatter_plot, inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, 
show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis], outputs=model_scatter_plot ) # Connect events # Load model analysis chart on app start app.load(lambda: update_radar_chart(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=model_analysis_chart) # Load scatter plot on app start app.load(lambda: update_scatter_plot(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=model_scatter_plot) # Load benchmark table on app start app.load(lambda: update_benchmark_table(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=benchmark_table) if __name__ == "__main__": app.launch(server_name="0.0.0.0", server_port=7860)
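

# A minimal usage sketch (an assumption, not part of the original app flow): the
# table-building helper defined above can also be called directly for quick
# inspection, assuming this file is saved as app.py and the data_loader module
# plus the CSV files sit next to it. Importing the module builds the Gradio
# Blocks but does not launch the server (the launch call is guarded by __main__).
#
#   from app import create_simplified_benchmark_table
#   df = create_simplified_benchmark_table(min_average_performance=0.8)
#   print(df.head())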