Rename src/plotting.py to src/leaderboard.py
src/leaderboard.py +381 -0
src/plotting.py +0 -296
src/leaderboard.py
ADDED
@@ -0,0 +1,381 @@
# src/leaderboard.py
import pandas as pd
from datasets import Dataset, load_dataset
import json
import datetime
from typing import Dict, List, Optional, Tuple
import os
from config import LEADERBOARD_DATASET, HF_TOKEN, ALL_UG40_LANGUAGES, GOOGLE_SUPPORTED_LANGUAGES
from src.utils import create_submission_id, sanitize_model_name, get_all_language_pairs, get_google_comparable_pairs

def initialize_leaderboard() -> pd.DataFrame:
    """Initialize empty leaderboard DataFrame."""

    columns = {
        'submission_id': [],
        'model_name': [],
        'author': [],
        'submission_date': [],
        'model_type': [],
        'description': [],

        # Primary metrics
        'quality_score': [],
        'bleu': [],
        'chrf': [],

        # Secondary metrics
        'rouge1': [],
        'rouge2': [],
        'rougeL': [],
        'cer': [],
        'wer': [],
        'len_ratio': [],

        # Google comparable metrics
        'google_quality_score': [],
        'google_bleu': [],
        'google_chrf': [],

        # Coverage info
        'total_samples': [],
        'language_pairs_covered': [],
        'google_pairs_covered': [],
        'coverage_rate': [],

        # Detailed results
        'detailed_metrics': [],  # JSON string
        'validation_report': [],

        # Metadata
        'evaluation_date': [],
        'leaderboard_version': []
    }

    return pd.DataFrame(columns)

def load_leaderboard() -> pd.DataFrame:
    """Load current leaderboard from HuggingFace dataset."""

    try:
        print("Loading leaderboard...")
        dataset = load_dataset(LEADERBOARD_DATASET, split='train')
        df = dataset.to_pandas()

        # Ensure all required columns exist
        required_columns = list(initialize_leaderboard().columns)
        for col in required_columns:
            if col not in df.columns:
                if col in ['quality_score', 'bleu', 'chrf', 'rouge1', 'rouge2', 'rougeL',
                           'cer', 'wer', 'len_ratio', 'google_quality_score', 'google_bleu',
                           'google_chrf', 'total_samples', 'language_pairs_covered',
                           'google_pairs_covered', 'coverage_rate']:
                    df[col] = 0.0
                elif col in ['leaderboard_version']:
                    df[col] = 1
                else:
                    df[col] = ''

        print(f"Loaded leaderboard with {len(df)} entries")
        return df

    except Exception as e:
        print(f"Could not load leaderboard: {e}")
        print("Initializing empty leaderboard...")
        return initialize_leaderboard()

def save_leaderboard(df: pd.DataFrame) -> bool:
    """Save leaderboard to HuggingFace dataset."""

    try:
        # Clean data before saving
        df_clean = df.copy()

        # Ensure numeric columns are proper types
        numeric_columns = ['quality_score', 'bleu', 'chrf', 'rouge1', 'rouge2', 'rougeL',
                           'cer', 'wer', 'len_ratio', 'google_quality_score', 'google_bleu',
                           'google_chrf', 'total_samples', 'language_pairs_covered',
                           'google_pairs_covered', 'coverage_rate', 'leaderboard_version']

        for col in numeric_columns:
            if col in df_clean.columns:
                df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce').fillna(0.0)

        # Convert to dataset
        dataset = Dataset.from_pandas(df_clean)

        # Push to hub
        dataset.push_to_hub(
            LEADERBOARD_DATASET,
            token=HF_TOKEN,
            commit_message=f"Update leaderboard - {datetime.datetime.now().isoformat()[:19]}"
        )

        print("Leaderboard saved successfully!")
        return True

    except Exception as e:
        print(f"Error saving leaderboard: {e}")
        return False

def add_model_to_leaderboard(
    model_name: str,
    author: str,
    evaluation_results: Dict,
    validation_info: Dict,
    model_type: str = "",
    description: str = ""
) -> pd.DataFrame:
    """Add new model results to leaderboard."""

    # Load current leaderboard
    df = load_leaderboard()

    # Check if model already exists
    existing_mask = df['model_name'] == model_name
    if existing_mask.any():
        print(f"Model '{model_name}' already exists. Updating...")
        df = df[~existing_mask]  # Remove existing entry

    # Extract metrics
    averages = evaluation_results.get('averages', {})
    google_averages = evaluation_results.get('google_comparable_averages', {})
    summary = evaluation_results.get('summary', {})

    # Create new entry
    new_entry = {
        'submission_id': create_submission_id(),
        'model_name': sanitize_model_name(model_name),
        'author': author[:100] if author else 'Anonymous',
        'submission_date': datetime.datetime.now().isoformat(),
        'model_type': model_type[:50] if model_type else 'unknown',
        'description': description[:500] if description else '',

        # Primary metrics
        'quality_score': float(averages.get('quality_score', 0.0)),
        'bleu': float(averages.get('bleu', 0.0)),
        'chrf': float(averages.get('chrf', 0.0)),

        # Secondary metrics
        'rouge1': float(averages.get('rouge1', 0.0)),
        'rouge2': float(averages.get('rouge2', 0.0)),
        'rougeL': float(averages.get('rougeL', 0.0)),
        'cer': float(averages.get('cer', 0.0)),
        'wer': float(averages.get('wer', 0.0)),
        'len_ratio': float(averages.get('len_ratio', 0.0)),

        # Google comparable metrics
        'google_quality_score': float(google_averages.get('quality_score', 0.0)),
        'google_bleu': float(google_averages.get('bleu', 0.0)),
        'google_chrf': float(google_averages.get('chrf', 0.0)),

        # Coverage info
        'total_samples': int(summary.get('total_samples', 0)),
        'language_pairs_covered': int(summary.get('language_pairs_covered', 0)),
        'google_pairs_covered': int(summary.get('google_comparable_pairs', 0)),
        'coverage_rate': float(validation_info.get('coverage', 0.0)),

        # Detailed results
        'detailed_metrics': json.dumps(evaluation_results),
        'validation_report': validation_info.get('report', ''),

        # Metadata
        'evaluation_date': datetime.datetime.now().isoformat(),
        'leaderboard_version': 1
    }

    # Add to dataframe
    new_row_df = pd.DataFrame([new_entry])
    updated_df = pd.concat([df, new_row_df], ignore_index=True)

    # Sort by quality score (descending)
    updated_df = updated_df.sort_values('quality_score', ascending=False).reset_index(drop=True)

    # Save updated leaderboard
    if save_leaderboard(updated_df):
        print(f"Added '{model_name}' to leaderboard")
        return updated_df
    else:
        print("Failed to save leaderboard")
        return df

def get_leaderboard_stats(df: pd.DataFrame) -> Dict:
    """Get summary statistics for the leaderboard."""

    if df.empty:
        return {
            'total_models': 0,
            'avg_quality_score': 0.0,
            'best_model': None,
            'latest_submission': None,
            'google_comparable_models': 0,
            'coverage_distribution': {},
            'language_pair_coverage': {}
        }

    # Basic stats
    stats = {
        'total_models': len(df),
        'avg_quality_score': float(df['quality_score'].mean()),
        'best_model': {
            'name': df.iloc[0]['model_name'],
            'score': float(df.iloc[0]['quality_score']),
            'author': df.iloc[0]['author']
        } if len(df) > 0 else None,
        'latest_submission': df['submission_date'].max() if len(df) > 0 else None
    }

    # Google comparable models
    stats['google_comparable_models'] = int((df['google_pairs_covered'] > 0).sum())

    # Coverage distribution
    coverage_bins = pd.cut(df['coverage_rate'], bins=[0, 0.5, 0.8, 0.95, 1.0],
                           labels=['<50%', '50-80%', '80-95%', '95-100%'])
    stats['coverage_distribution'] = coverage_bins.value_counts().to_dict()

    # Language pair coverage
    if len(df) > 0:
        stats['avg_pairs_covered'] = float(df['language_pairs_covered'].mean())
        stats['max_pairs_covered'] = int(df['language_pairs_covered'].max())
        stats['total_possible_pairs'] = len(get_all_language_pairs())

    return stats

def filter_leaderboard(
    df: pd.DataFrame,
    search_query: str = "",
    model_type: str = "",
    min_coverage: float = 0.0,
    google_comparable_only: bool = False,
    top_n: int = None
) -> pd.DataFrame:
    """Filter leaderboard based on various criteria."""

    filtered_df = df.copy()

    # Text search
    if search_query:
        query_lower = search_query.lower()
        mask = (
            filtered_df['model_name'].str.lower().str.contains(query_lower, na=False) |
            filtered_df['author'].str.lower().str.contains(query_lower, na=False) |
            filtered_df['description'].str.lower().str.contains(query_lower, na=False)
        )
        filtered_df = filtered_df[mask]

    # Model type filter
    if model_type and model_type != "all":
        filtered_df = filtered_df[filtered_df['model_type'] == model_type]

    # Coverage filter
    if min_coverage > 0:
        filtered_df = filtered_df[filtered_df['coverage_rate'] >= min_coverage]

    # Google comparable filter
    if google_comparable_only:
        filtered_df = filtered_df[filtered_df['google_pairs_covered'] > 0]

    # Top N filter
    if top_n:
        filtered_df = filtered_df.head(top_n)

    return filtered_df

def get_model_comparison(df: pd.DataFrame, model_names: List[str]) -> Dict:
    """Get detailed comparison between specific models."""

    models = df[df['model_name'].isin(model_names)]

    if len(models) == 0:
        return {'error': 'No models found'}

    comparison = {
        'models': [],
        'metrics_comparison': {},
        'detailed_results': {}
    }

    # Extract basic info for each model
    for _, model in models.iterrows():
        comparison['models'].append({
            'name': model['model_name'],
            'author': model['author'],
            'submission_date': model['submission_date'],
            'model_type': model['model_type']
        })

        # Parse detailed metrics if available
        try:
            detailed = json.loads(model['detailed_metrics'])
            comparison['detailed_results'][model['model_name']] = detailed
        except:
            comparison['detailed_results'][model['model_name']] = {}

    # Compare metrics
    metrics = ['quality_score', 'bleu', 'chrf', 'rouge1', 'rougeL', 'cer', 'wer']
    for metric in metrics:
        if metric in models.columns:
            comparison['metrics_comparison'][metric] = {
                model_name: float(score)
                for model_name, score in zip(models['model_name'], models[metric])
            }

    return comparison

def export_leaderboard(df: pd.DataFrame, format: str = 'csv', include_detailed: bool = False) -> str:
    """Export leaderboard in specified format."""

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Select columns for export
    if include_detailed:
        export_df = df.copy()
    else:
        basic_columns = [
            'model_name', 'author', 'submission_date', 'model_type',
            'quality_score', 'bleu', 'chrf', 'rouge1', 'rougeL',
            'total_samples', 'language_pairs_covered', 'coverage_rate'
        ]
        export_df = df[basic_columns].copy()

    if format == 'csv':
        filename = f"salt_leaderboard_{timestamp}.csv"
        export_df.to_csv(filename, index=False)
    elif format == 'json':
        filename = f"salt_leaderboard_{timestamp}.json"
        export_df.to_json(filename, orient='records', indent=2)
    elif format == 'xlsx':
        filename = f"salt_leaderboard_{timestamp}.xlsx"
        export_df.to_excel(filename, index=False)
    else:
        raise ValueError(f"Unsupported format: {format}")

    return filename

def get_ranking_history(df: pd.DataFrame, model_name: str) -> Dict:
    """Get ranking history for a specific model (if multiple submissions)."""

    model_entries = df[df['model_name'] == model_name].sort_values('submission_date')

    if len(model_entries) == 0:
        return {'error': 'Model not found'}

    history = []
    for _, entry in model_entries.iterrows():
        # Calculate rank at time of submission
        submission_date = entry['submission_date']
        historical_df = df[df['submission_date'] <= submission_date]
        rank = (historical_df['quality_score'] > entry['quality_score']).sum() + 1

        history.append({
            'submission_date': submission_date,
            'quality_score': float(entry['quality_score']),
            'rank': int(rank),
            'total_models': len(historical_df)
        })

    return {
        'model_name': model_name,
        'history': history,
        'current_rank': history[-1]['rank'] if history else None
    }
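For context, here is a minimal usage sketch of the new module, not part of the diff itself. It assumes `config` defines `LEADERBOARD_DATASET` and `HF_TOKEN`, that `src.utils` provides the helpers imported above, and that the `evaluation_results` / `validation_info` dictionaries follow the shapes the code reads (`averages`, `google_comparable_averages`, `summary`, `coverage`, `report`); all concrete names and values below are hypothetical.

# Hypothetical usage sketch; values are illustrative only.
# Note: add_model_to_leaderboard() loads the current leaderboard and pushes
# the updated dataset to LEADERBOARD_DATASET using HF_TOKEN.
from src.leaderboard import (
    add_model_to_leaderboard, filter_leaderboard,
    get_leaderboard_stats, export_leaderboard,
)

evaluation_results = {
    'averages': {'quality_score': 0.41, 'bleu': 22.3, 'chrf': 0.48},
    'google_comparable_averages': {'quality_score': 0.39, 'bleu': 21.0, 'chrf': 0.46},
    'summary': {'total_samples': 5000, 'language_pairs_covered': 12,
                'google_comparable_pairs': 8},
}
validation_info = {'coverage': 0.92, 'report': 'All required pairs present.'}

# Add (or replace) a submission, then inspect the updated table.
df = add_model_to_leaderboard(
    model_name='example-mt-model',
    author='Example Author',
    evaluation_results=evaluation_results,
    validation_info=validation_info,
    model_type='seq2seq',
    description='Illustrative submission.',
)
print(get_leaderboard_stats(df)['total_models'])

# Narrow the view and export a snapshot to CSV.
top = filter_leaderboard(df, min_coverage=0.8, top_n=10)
print(export_leaderboard(top, format='csv'))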
src/plotting.py
DELETED
@@ -1,296 +0,0 @@
# src/plotting.py
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors
from colorsys import rgb_to_hls, hls_to_rgb
from collections import defaultdict
import numpy as np
import pandas as pd
from config import LANGUAGE_NAMES

def create_leaderboard_plot(leaderboard_df: pd.DataFrame, metric: str = 'quality_score') -> plt.Figure:
    """Create a horizontal bar chart showing model rankings."""

    fig, ax = plt.subplots(figsize=(12, 8))

    # Sort by the selected metric (descending)
    df_sorted = leaderboard_df.sort_values(metric, ascending=True)

    # Create color palette
    colors = plt.cm.viridis(np.linspace(0, 1, len(df_sorted)))

    # Create horizontal bar chart
    bars = ax.barh(range(len(df_sorted)), df_sorted[metric], color=colors)

    # Customize the plot
    ax.set_yticks(range(len(df_sorted)))
    ax.set_yticklabels(df_sorted['model_display_name'])
    ax.set_xlabel(f'{metric.replace("_", " ").title()} Score')
    ax.set_title(f'Model Leaderboard - {metric.replace("_", " ").title()}', fontsize=16, pad=20)

    # Add value labels on bars
    for i, (bar, value) in enumerate(zip(bars, df_sorted[metric])):
        ax.text(value + 0.001, bar.get_y() + bar.get_height()/2,
                f'{value:.3f}', ha='left', va='center', fontweight='bold')

    # Add grid for better readability
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    ax.set_axisbelow(True)

    # Set x-axis limits with some padding
    max_val = df_sorted[metric].max()
    ax.set_xlim(0, max_val * 1.15)

    plt.tight_layout()
    return fig

def create_detailed_comparison_plot(metrics_data: dict, model_names: list) -> plt.Figure:
    """Create detailed comparison plot similar to the original evaluation script."""

    # Filter metrics_data to only include models in model_names
    filtered_metrics = {name: metrics_data[name] for name in model_names if name in metrics_data}

    if not filtered_metrics:
        # Create empty plot if no data
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.text(0.5, 0.5, 'No data available for comparison',
                ha='center', va='center', transform=ax.transAxes, fontsize=16)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis('off')
        return fig

    return plot_translation_metric_comparison(filtered_metrics, metric='bleu')

def plot_translation_metric_comparison(metrics_by_model: dict, metric: str = 'bleu') -> plt.Figure:
    """
    Creates a grouped bar chart comparing a selected metric across translation models.
    Adapted from the original plotting code.
    """

    # Split language pairs into xx_to_eng and eng_to_xx categories
    first_model_data = list(metrics_by_model.values())[0]
    xx_to_eng = [key for key in first_model_data.keys()
                 if key.endswith('_to_eng') and key != 'averages']
    eng_to_xx = [key for key in first_model_data.keys()
                 if key.startswith('eng_to_') and key != 'averages']

    # Function to create nice labels
    def format_label(label):
        if label.startswith("eng_to_"):
            source, target = "English", label.replace("eng_to_", "")
            target = LANGUAGE_NAMES.get(target, target)
        else:
            source, target = label.replace("_to_eng", ""), "English"
            source = LANGUAGE_NAMES.get(source, source)
        return f"{source}→{target}"

    # Extract metric values for each category
    def extract_metric_values(model_metrics, pairs, metric_name):
        return [model_metrics.get(pair, {}).get(metric_name, 0.0) for pair in pairs]

    xx_to_eng_data = {
        model_name: extract_metric_values(model_data, xx_to_eng, metric)
        for model_name, model_data in metrics_by_model.items()
    }

    eng_to_xx_data = {
        model_name: extract_metric_values(model_data, eng_to_xx, metric)
        for model_name, model_data in metrics_by_model.items()
    }

    averages_data = {
        model_name: [model_data.get("averages", {}).get(metric, 0.0)]
        for model_name, model_data in metrics_by_model.items()
    }

    # Set up plot with custom grid
    fig = plt.figure(figsize=(18, 12))  # Increased height for better spacing

    # Create a GridSpec with 1 row and 5 columns
    gs = gridspec.GridSpec(1, 5)

    # Colors for the models
    model_names = list(metrics_by_model.keys())

    family_base_colors = {
        'gemma': '#3274A1',
        'nllb': '#7f7f7f',
        'qwen': '#E1812C',
        'google': '#3A923A',
        'other': '#D62728',
    }

    # Identify the family for each model
    def get_family(model_name):
        model_lower = model_name.lower()
        if 'gemma' in model_lower:
            return 'gemma'
        elif 'qwen' in model_lower:
            return 'qwen'
        elif 'nllb' in model_lower:
            return 'nllb'
        elif 'google' in model_lower or model_name == 'google-translate':
            return 'google'
        else:
            return 'other'

    # Count how many models belong to each family
    family_counts = defaultdict(int)
    for model in model_names:
        family = get_family(model)
        family_counts[family] += 1

    # Generate slightly varied lightness within each family
    colors = []
    family_indices = defaultdict(int)
    for model in model_names:
        family = get_family(model)
        base_rgb = mcolors.to_rgb(family_base_colors[family])
        h, l, s = rgb_to_hls(*base_rgb)

        index = family_indices[family]
        count = family_counts[family]

        # Vary lightness: from 0.35 to 0.65
        if count == 1:
            new_l = l  # Keep original for single models
        else:
            new_l = 0.65 - 0.3 * (index / max(count - 1, 1))

        varied_rgb = hls_to_rgb(h, new_l, s)
        hex_color = mcolors.to_hex(varied_rgb)
        colors.append(hex_color)
        family_indices[family] += 1

    bar_width = 0.2
    opacity = 0.8

    # Positions for the bars
    xx_to_eng_indices = np.arange(len(xx_to_eng))
    eng_to_xx_indices = np.arange(len(eng_to_xx))
    avg_index = np.array([0])

    # Determine y-axis limits based on metric
    if metric in ['chrf', 'len_ratio']:
        y_max = 1.1
    elif metric in ['cer', 'wer']:
        y_max = 1.0
    elif metric == 'bleu':
        y_max = 65  # Increased from 55 to accommodate high scores
    elif metric in ['rouge1', 'rouge2', 'rougeL']:
        y_max = 1.0
    elif metric == 'quality_score':
        y_max = 0.65
    else:
        # Auto-scale based on data
        all_values = []
        for data in [xx_to_eng_data, eng_to_xx_data, averages_data]:
            for model_data in data.values():
                all_values.extend(model_data)
        y_max = max(all_values) * 1.1 if all_values else 1.0

    # Format metric name for display
    metric_display = metric.upper() if metric in ['bleu', 'chrf', 'cer', 'wer'] else metric.replace('_', ' ').title()

    # Create bars for xx_to_eng (using first 2 columns)
    if xx_to_eng:
        ax1 = plt.subplot(gs[0, 0:2])
        for i, (model_name, color) in enumerate(zip(model_names, colors)):
            if model_name in xx_to_eng_data:
                ax1.bar(xx_to_eng_indices + i*bar_width, xx_to_eng_data[model_name],
                        bar_width, alpha=opacity, color=color, label=model_name)

        ax1.set_xlabel('Translation Direction')
        ax1.set_ylabel(f'{metric_display} Score')
        ax1.set_title(f'XX→English {metric_display} Performance')
        ax1.set_xticks(xx_to_eng_indices + bar_width)
        ax1.set_xticklabels([format_label(label) for label in xx_to_eng], rotation=45, ha='right')
        ax1.set_ylim(0, y_max)
        ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Create bars for eng_to_xx (using next 2 columns)
    if eng_to_xx:
        ax2 = plt.subplot(gs[0, 2:4])
        for i, (model_name, color) in enumerate(zip(model_names, colors)):
            if model_name in eng_to_xx_data:
                ax2.bar(eng_to_xx_indices + i*bar_width, eng_to_xx_data[model_name],
                        bar_width, alpha=opacity, color=color, label=model_name)

        ax2.set_xlabel('Translation Direction')
        ax2.set_ylabel(f'{metric_display} Score')
        ax2.set_title(f'English→XX {metric_display} Performance')
        ax2.set_xticks(eng_to_xx_indices + bar_width)
        ax2.set_xticklabels([format_label(label) for label in eng_to_xx], rotation=45, ha='right')
        ax2.set_ylim(0, y_max)
        ax2.grid(axis='y', linestyle='--', alpha=0.7)

    # Create bars for averages (using last column)
    ax3 = plt.subplot(gs[0, 4])
    for i, (model_name, color) in enumerate(zip(model_names, colors)):
        if model_name in averages_data:
            ax3.bar(avg_index + i*bar_width, averages_data[model_name],
                    bar_width, alpha=opacity, color=color, label=model_name)

    ax3.set_xlabel('Overall')
    ax3.set_ylabel(f'{metric_display} Score')
    ax3.set_title(f'Average {metric_display}')
    ax3.set_xticks(avg_index + bar_width)
    ax3.set_xticklabels(['Average'])
    ax3.set_ylim(0, y_max)
    ax3.grid(axis='y', linestyle='--', alpha=0.7)
    ax3.legend()

    # Add note for metrics where lower is better
    if metric in ['cer', 'wer']:
        plt.figtext(0.5, 0.01, "Note: Lower values indicate better performance for this metric",
                    ha='center', fontsize=12, style='italic')

    # Add an overall title and adjust layout
    model_list = ' vs '.join(model_names)
    plt.suptitle(f'{metric_display} Score Comparison: {model_list}', fontsize=16, y=0.98)
    plt.tight_layout(rect=[0, 0.02, 1, 0.95])

    return fig

def create_summary_metrics_plot(leaderboard_df: pd.DataFrame) -> plt.Figure:
    """Create a summary plot showing multiple metrics for top models."""

    if leaderboard_df.empty:
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.text(0.5, 0.5, 'No data available', ha='center', va='center',
                transform=ax.transAxes, fontsize=16)
        return fig

    # Select top 5 models by quality score
    top_models = leaderboard_df.nlargest(5, 'quality_score')

    # Metrics to display
    metrics = ['bleu', 'chrf', 'quality_score']
    metric_labels = ['BLEU', 'ChrF', 'Quality Score']

    fig, axes = plt.subplots(1, 3, figsize=(15, 6))

    for i, (metric, label) in enumerate(zip(metrics, metric_labels)):
        ax = axes[i]

        # Sort by current metric
        sorted_models = top_models.sort_values(metric, ascending=True)

        # Create horizontal bar chart
        bars = ax.barh(range(len(sorted_models)), sorted_models[metric],
                       color=plt.cm.viridis(np.linspace(0, 1, len(sorted_models))))

        ax.set_yticks(range(len(sorted_models)))
        ax.set_yticklabels(sorted_models['model_display_name'])
        ax.set_xlabel(f'{label} Score')
        ax.set_title(f'Top Models - {label}')
        ax.grid(axis='x', linestyle='--', alpha=0.7)

        # Add value labels
        for j, (bar, value) in enumerate(zip(bars, sorted_models[metric])):
            ax.text(value + value*0.01, bar.get_y() + bar.get_height()/2,
                    f'{value:.3f}', ha='left', va='center', fontsize=10)

    plt.tight_layout()
    return fig