Spaces:

MALIBA-AI
/

bambara-asr-leaderboard

Running

File size: 61,285 Bytes

# import gradio as gr
# import pandas as pd
# from datasets import load_dataset
# from jiwer import wer, cer
# import os
# from datetime import datetime
# import re

# from huggingface_hub import login

# # Login to Hugging Face Hub (if token is available)
# token = os.environ.get("HG_TOKEN")
# if token:
#     login(token)


# try:
#     dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
#     references = {row["id"]: row["text"] for row in dataset}
#     print(f"Loaded {len(references)} reference transcriptions")
# except Exception as e:
#     print(f"Error loading dataset: {str(e)}")
#     references = {}


# leaderboard_file = "leaderboard.csv"
# if not os.path.exists(leaderboard_file):

#     sample_data = [
#           ["test_1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
#          ["test_2", 0.3264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
#         ]
#     pd.DataFrame(sample_data, 
#                  columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]).to_csv(leaderboard_file, index=False)
#     print(f"Created new leaderboard file with sample data")
# else:
#     leaderboard_df = pd.read_csv(leaderboard_file)
    

#     if "Combined_Score" not in leaderboard_df.columns:
#         leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
#         leaderboard_df.to_csv(leaderboard_file, index=False)
#         print(f"Added Combined_Score column to existing leaderboard")
#     print(f"Loaded leaderboard with {len(leaderboard_df)} entries")

# def normalize_text(text):
#     """Normalize text for WER/CER calculation"""
#     if not isinstance(text, str):
#         text = str(text)
    
#     text = text.lower()
#     text = re.sub(r'[^\w\s]', '', text)
#     text = re.sub(r'\s+', ' ', text).strip()
#     return text

# def calculate_metrics(predictions_df):
#     """Calculate WER and CER for predictions."""
#     results = []
#     total_ref_words = 0
#     total_ref_chars = 0

#     for _, row in predictions_df.iterrows():
#         id_val = row["id"]
#         if id_val not in references:
#             continue
            
#         reference = normalize_text(references[id_val])
#         hypothesis = normalize_text(row["text"])
        
#         if not reference or not hypothesis:
#             continue
            
#         reference_words = reference.split()
#         hypothesis_words = hypothesis.split()
#         reference_chars = list(reference)
        
#         try:
#             sample_wer = wer(reference, hypothesis)
#             sample_cer = cer(reference, hypothesis)
            
#             sample_wer = min(sample_wer, 2.0)  
#             sample_cer = min(sample_cer, 2.0)  
            
#             total_ref_words += len(reference_words)
#             total_ref_chars += len(reference_chars)
            
#             results.append({
#                 "id": id_val,
#                 "reference": reference,
#                 "hypothesis": hypothesis,
#                 "ref_word_count": len(reference_words),
#                 "ref_char_count": len(reference_chars),
#                 "wer": sample_wer,
#                 "cer": sample_cer
#             })
#         except Exception as e:
#             print(f"Error processing sample {id_val}: {str(e)}")
#             pass
    
#     if not results:
#         raise ValueError("No valid samples for WER/CER calculation")
        
#     avg_wer = sum(item["wer"] for item in results) / len(results)
#     avg_cer = sum(item["cer"] for item in results) / len(results)
    

#     weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
#     weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars
    
#     return avg_wer, avg_cer, weighted_wer, weighted_cer, results

# def format_as_percentage(value):
#     """Convert decimal to percentage with 2 decimal places"""
#     return f"{value * 100:.2f}%"

# def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
#     """Format leaderboard for display with ranking and percentages"""
#     if df is None or len(df) == 0:
#         return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
    

#     display_df = df.copy()
    

#     display_df = display_df.sort_values(sort_by)
    
#     display_df.insert(0, "Rank", range(1, len(display_df) + 1))
    
#     for col in ["WER", "CER", "Combined_Score"]:
#         if col in display_df.columns:
#             display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
    

    
#     return display_df

# def update_ranking(method):
#     """Update leaderboard ranking based on selected method"""
#     try:
#         current_lb = pd.read_csv(leaderboard_file)
        
#         if "Combined_Score" not in current_lb.columns:
#             current_lb["Combined_Score"] = current_lb["WER"] * 0.7 + current_lb["CER"] * 0.3
        
#         sort_column = "Combined_Score"
#         if method == "WER Only":
#             sort_column = "WER"
#         elif method == "CER Only":
#             sort_column = "CER"
        
#         return prepare_leaderboard_for_display(current_lb, sort_column)
        
#     except Exception as e:
#         print(f"Error updating ranking: {str(e)}")
#         return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])

# def process_submission(model_name, csv_file):
#     """Process a new model submission"""
#     if not model_name or not model_name.strip():
#         return "Error: Please provide a model name.", None
        
#     if not csv_file:
#         return "Error: Please upload a CSV file.", None
    
#     try:
#         df = pd.read_csv(csv_file)
        
#         if len(df) == 0:
#             return "Error: Uploaded CSV is empty.", None
            
#         if set(df.columns) != {"id", "text"}:
#             return f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None
            
#         if df["id"].duplicated().any():
#             dup_ids = df[df["id"].duplicated()]["id"].unique()
#             return f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None

#         missing_ids = set(references.keys()) - set(df["id"])
#         extra_ids = set(df["id"]) - set(references.keys())
        
#         if missing_ids:
#             return f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}", None
            
#         if extra_ids:
#             return f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}", None
        
#         try:
#             avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)
            
#             # Check for suspiciously low values
#             if avg_wer < 0.001:
#                 return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None
                
#         except Exception as e:
#             return f"Error calculating metrics: {str(e)}", None
        

#         leaderboard = pd.read_csv(leaderboard_file)
#         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
#         combined_score = avg_wer * 0.7 + avg_cer * 0.3
        
#         if model_name in leaderboard["Model_Name"].values:
#             idx = leaderboard[leaderboard["Model_Name"] == model_name].index
#             leaderboard.loc[idx, "WER"] = avg_wer
#             leaderboard.loc[idx, "CER"] = avg_cer
#             leaderboard.loc[idx, "Combined_Score"] = combined_score
#             leaderboard.loc[idx, "timestamp"] = timestamp
#             updated_leaderboard = leaderboard
#         else:
#             new_entry = pd.DataFrame(
#                 [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
#                 columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
#             )
#             updated_leaderboard = pd.concat([leaderboard, new_entry])
        
#         updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
#         updated_leaderboard.to_csv(leaderboard_file, index=False)
        
#         display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
        
#         return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
        
#     except Exception as e:
#         return f"Error processing submission: {str(e)}", None

# def get_current_leaderboard():
#     """Get the current leaderboard data for display"""
#     try:
#         if os.path.exists(leaderboard_file):
#             current_leaderboard = pd.read_csv(leaderboard_file)
            
#             if "Combined_Score" not in current_leaderboard.columns:
#                 current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
#                 current_leaderboard.to_csv(leaderboard_file, index=False)
                
#             return current_leaderboard
#         else:
#             return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])
#     except Exception as e:
#         print(f"Error getting leaderboard: {str(e)}")
#         return pd.DataFrame(columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"])

# def create_leaderboard_table():
#     """Create and format the leaderboard table for display"""
#     leaderboard_data = get_current_leaderboard()
#     return prepare_leaderboard_for_display(leaderboard_data)

# with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
#     gr.Markdown(
#         """
#         # 🇲🇱 Bambara ASR Leaderboard
        
#         This leaderboard tracks and evaluates speech recognition models for the Bambara language.
#         Models are ranked based on Word Error Rate (WER), Character Error Rate (CER), and a combined score.
        
#         ## Current Models Performance
#         """
#     )
    
#     current_data = get_current_leaderboard()
    

#     if len(current_data) > 0:
#         best_model = current_data.sort_values("Combined_Score").iloc[0]
#         gr.Markdown(f"""
#         ### 🏆 Current Best Model: **{best_model['Model_Name']}**
#         * WER: **{best_model['WER']*100:.2f}%**
#         * CER: **{best_model['CER']*100:.2f}%**
#         * Combined Score: **{best_model['Combined_Score']*100:.2f}%**
#         """)
    
#     with gr.Tabs() as tabs:
#         with gr.TabItem("🏅 Model Rankings"):

#             initial_leaderboard = create_leaderboard_table()
            
#             ranking_method = gr.Radio(
#                 ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"], 
#                 label="Ranking Method",
#                 value="Combined Score (WER 70%, CER 30%)"
#             )
            
#             leaderboard_view = gr.DataFrame(
#                 value=initial_leaderboard,
#                 interactive=False,
#                 label="Models are ranked by selected metric - lower is better"
#             )
            
#             ranking_method.change(
#                 fn=update_ranking,
#                 inputs=[ranking_method],
#                 outputs=[leaderboard_view]
#             )
            
#             with gr.Accordion("Metrics Explanation", open=False):
#                 gr.Markdown(
#                     """
#                     ## Understanding ASR Metrics
                    
#                     ### Word Error Rate (WER)
#                     WER measures how accurately the ASR system recognizes whole words:
#                     * Lower values indicate better performance
#                     * Calculated as: (Substitutions + Insertions + Deletions) / Total Words
#                     * A WER of 0% means perfect transcription
#                     * A WER of 20% means approximately 1 in 5 words contains an error
                    
#                     ### Character Error Rate (CER)
#                     CER measures accuracy at the character level:
#                     * More fine-grained than WER
#                     * Better at capturing partial word matches
#                     * Particularly useful for agglutinative languages like Bambara
                    
#                     ### Combined Score
#                     * Weighted average: 70% WER + 30% CER
#                     * Provides a balanced evaluation of model performance
#                     * Used as the primary ranking metric
#                     """
#                 )
        
#         with gr.TabItem("📊 Submit New Results"):
#             gr.Markdown(
#                 """
#                 ### Submit a new model for evaluation
                
#                 Upload a CSV file with the following format:
#                 * Must contain exactly two columns: 'id' and 'text'
#                 * The 'id' column should match the reference dataset IDs
#                 * The 'text' column should contain your model's transcriptions
#                 """
#             )
            
#             with gr.Row():
#                 model_name_input = gr.Textbox(
#                     label="Model Name", 
#                     placeholder="e.g., MALIBA-AI/bambara-asr"
#                 )
#                 gr.Markdown("*Use a descriptive name to identify your model*")
            
#             with gr.Row():
#                 csv_upload = gr.File(
#                     label="Upload CSV File", 
#                     file_types=[".csv"]
#                 )
#                 gr.Markdown("*CSV with columns: id, text*")
                
#             submit_btn = gr.Button("Submit", variant="primary")
#             output_msg = gr.Textbox(label="Status", interactive=False)
#             leaderboard_display = gr.DataFrame(
#                 label="Updated Leaderboard",
#                 value=initial_leaderboard,
#                 interactive=False
#             )
            
#             submit_btn.click(
#                 fn=process_submission,
#                 inputs=[model_name_input, csv_upload],
#                 outputs=[output_msg, leaderboard_display]
#             )
            
#         with gr.TabItem("📝 Benchmark Dataset"):
#             gr.Markdown(
#                 """
#                 ## About the Benchmark Dataset
                
#                 This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/MALIBA-AI/bambara-speech-recognition-leaderboard)** dataset:
                
#                 * Contains diverse Bambara speech samples
#                 * Includes various speakers, accents, and dialects
#                 * Covers different speech styles and recording conditions
#                 * Transcribed and validated
                
#                 ### How to Generate Predictions
                
#                 To submit results to this leaderboard:
                
#                 1. Download the audio files from the benchmark dataset
#                 2. Run your ASR model on the audio files
#                 3. Generate a CSV file with 'id' and 'text' columns
#                 4. Submit your results using the form in the "Submit New Results" tab
                
#                 ### Evaluation Guidelines
                
#                 * Text is normalized (lowercase, punctuation removed) before metrics calculation
#                 * Extreme outliers are capped to prevent skewing results
#                 * All submissions are validated for format and completeness

#                 NB: This work is a collaboration between MALIBA-AI, RobotsMali AI4D-LAB and Djelia
#                 """
#             )
            
#     gr.Markdown(
#         """
#         ---
#         ### About MALIBA-AI
        
#         **MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation**
        
#         *"No Malian Language Left Behind"*
        
#         This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology.
#         For more information, visit [MALIBA-AI on Hugging Face](https://huggingface.co/MALIBA-AI).
#         """
#     )

# if __name__ == "__main__":
#     demo.launch()


import gradio as gr
import pandas as pd
from datasets import load_dataset
from jiwer import wer, cer
import os
from datetime import datetime
import re
import plotly.express as px
import plotly.graph_objects as go
from huggingface_hub import login
import numpy as np

# Extra <head> markup: preconnect hints plus the Google Fonts stylesheet (Inter, Rubik)
custom_head_html = """
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Rubik:wght@400;600&display=swap" rel="stylesheet">
"""

# Header with MALIBA-AI branding
new_header_html = """
<center>
    <br><br>
    <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-bottom: 20px;">
        <div style="font-size: 4em;">🇲🇱</div>
        <div>
            <h1 style="margin: 0; font-family: 'Rubik', sans-serif; color: #2f3b7d; font-size: 2.5em; font-weight: 700;">
                Bambara ASR Leaderboard
            </h1>
            <p style="margin: 5px 0 0 0; font-size: 1.2em; color: #7d3561; font-weight: 600;">
                Powered by MALIBA-AI • "No Malian Language Left Behind"
            </p>
        </div>
        <div style="font-size: 4em;">🎙️</div>
    </div>
</center>
"""

# Advanced CSS styling inspired by Sahara
sahara_style_css = """
/* Global Styles */
div[class*="gradio-container"] {
    background: #FFFBF5 !important;
    color: #000 !important;
    font-family: 'Inter', sans-serif !important;
}

div.svelte-1nguped {
    background: white !important;
}

.fillable.svelte-15jxnnn.svelte-15jxnnn:not(.fill_width) {
    max-width: 1580px !important;
}

/* Navigation Buttons */
.nav-button {
    background-color: #117b75 !important;
    color: #fff !important;
    font-weight: bold !important;
    border-radius: 8px !important;
    border: none !important;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
    transition: all 0.3s ease !important;
}

.nav-button:hover {
    background-color: #0f6b66 !important;
    color: #e8850e !important;
    transform: translateY(-1px) !important;
    box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;
}

/* Content Cards */
.content-section {
    padding: 40px 0;
}

.content-card {
    background-color: #fff !important;
    border-radius: 16px !important;
    box-shadow: 0 10px 25px -5px rgba(0,0,0,0.1), 0 8px 10px -6px rgba(0,0,0,0.1) !important;
    padding: 40px !important;
    margin-bottom: 30px !important;
    border: 1px solid rgba(0,0,0,0.05) !important;
}

/* Typography */
.content-card h2 {
    font-family: "Rubik", sans-serif !important;
    font-size: 32px !important;
    font-weight: 700 !important;
    line-height: 1.25 !important;
    letter-spacing: -1px !important;
    color: #2f3b7d !important;
    margin-bottom: 20px !important;
    text-align: center !important;
}

.content-card h3 {
    font-size: 22px !important;
    color: #2f3b7d !important;
    font-weight: 600 !important;
    margin-bottom: 15px !important;
}

.content-card h4 {
    font-family: "Rubik", sans-serif !important;
    color: #7d3561 !important;
    font-weight: 600 !important;
    margin-bottom: 10px !important;
}

.title {
    color: #7d3561 !important;
    font-weight: 600 !important;
}

/* Tab Styling */
.tab-wrapper.svelte-1tcem6n.svelte-1tcem6n {
    display: flex;
    align-items: center;
    justify-content: space-between;
    position: relative;
    height: auto !important;
    padding-bottom: 0 !important;
}

.selected.svelte-1tcem6n.svelte-1tcem6n {
    background-color: #7d3561 !important;
    color: #fff !important;
    border-radius: 8px 8px 0 0 !important;
}

button.svelte-1tcem6n.svelte-1tcem6n {
    color: #7d3561 !important;
    font-weight: 600 !important;
    font-size: 16px !important;
    padding: 12px 20px !important;
    background-color: #fff !important;
    border-radius: 8px 8px 0 0 !important;
    border: 2px solid #e9ecef !important;
    border-bottom: none !important;
    transition: all 0.3s ease !important;
}

button.svelte-1tcem6n.svelte-1tcem6n:hover {
    background-color: #f8f9fa !important;
    border-color: #7d3561 !important;
}

.tab-container.svelte-1tcem6n.svelte-1tcem6n:after {
    content: "";
    position: absolute;
    bottom: 0;
    left: 0;
    right: 0;
    height: 3px;
    background: linear-gradient(90deg, #7d3561 0%, #2f3b7d 100%) !important;
}

/* Table Styling */
div[class*="gradio-container"] .prose table {
    color: #000 !important;
    border: 2px solid #dca02a !important;
    border-radius: 12px !important;
    margin-bottom: 20px !important;
    margin-left: auto !important;
    margin-right: auto !important;
    width: 100% !important;
    border-collapse: separate !important;
    border-spacing: 0 !important;
    overflow: hidden !important;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important;
}

div[class*="gradio-container"] .prose thead tr {
    background: linear-gradient(90deg, #7d3561 0%, #2f3b7d 100%) !important;
}

div[class*="gradio-container"] .prose th {
    color: #fff !important;
    font-weight: 700 !important;
    font-size: 14px !important;
    padding: 15px 10px !important;
    text-align: center !important;
    border: none !important;
}

div[class*="gradio-container"] .prose td {
    font-size: 14px !important;
    padding: 12px 10px !important;
    border: none !important;
    text-align: center !important;
    color: #000 !important;
    border-bottom: 1px solid #f8f9fa !important;
}

div[class*="gradio-container"] .prose tbody tr:nth-child(even) {
    background-color: #f8f9fa !important;
}

div[class*="gradio-container"] .prose tbody tr:hover {
    background-color: #e3f2fd !important;
    transition: background-color 0.2s ease !important;
}

/* First column (model names) styling */
div[class*="gradio-container"] .prose th:first-child,
div[class*="gradio-container"] .prose td:first-child {
    text-align: left !important;
    min-width: 250px !important;
    font-weight: 600 !important;
}

/* Performance badges */
.performance-badge {
    display: inline-block;
    padding: 4px 8px;
    border-radius: 12px;
    font-size: 12px;
    font-weight: 600;
    margin-left: 8px;
}

.badge-excellent {
    background: #d4edda;
    color: #155724;
}

.badge-good {
    background: #fff3cd;
    color: #856404;
}

.badge-fair {
    background: #f8d7da;
    color: #721c24;
}

/* Stats cards */
.stats-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
    gap: 20px;
    margin: 20px 0;
}

.stat-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 12px;
    text-align: center;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}

.stat-number {
    font-size: 2em;
    font-weight: 700;
    margin-bottom: 5px;
}

.stat-label {
    font-size: 0.9em;
    opacity: 0.9;
}

/* Form styling */
.form-section {
    background: #f8f9fa;
    border-radius: 12px;
    padding: 25px;
    margin: 20px 0;
    border-left: 4px solid #7d3561;
}

/* Citation block */
.citation-block {
    background-color: #FDF6E3 !important;
    border-radius: 12px !important;
    padding: 25px !important;
    border-left: 4px solid #D97706 !important;
    margin: 20px 0 !important;
}

/* Dropdown styling */
.gradio-dropdown {
    border-radius: 8px !important;
    border: 2px solid #e9ecef !important;
}

.gradio-dropdown:focus {
    border-color: #7d3561 !important;
    box-shadow: 0 0 0 3px rgba(125, 53, 97, 0.1) !important;
}

/* Button styling */
.gradio-button {
    border-radius: 8px !important;
    font-weight: 600 !important;
    transition: all 0.3s ease !important;
}

.gradio-button.primary {
    background: linear-gradient(135deg, #7d3561 0%, #2f3b7d 100%) !important;
    border: none !important;
    color: white !important;
}

.gradio-button.primary:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 12px rgba(125, 53, 97, 0.3) !important;
}

/* Responsive design */
@media (max-width: 768px) {
    .content-card {
        padding: 20px !important;
        margin-bottom: 20px !important;
    }
    
    .content-card h2 {
        font-size: 24px !important;
    }
    
    .stats-grid {
        grid-template-columns: 1fr !important;
    }
}
"""

# Authenticate against the Hugging Face Hub, but only when a token is configured.
# NOTE(review): the variable read here is HG_TOKEN, not the conventional HF_TOKEN —
# confirm the Space secret really uses this name.
token = os.getenv("HG_TOKEN")
if token:
    login(token)

# Fetch the benchmark's "eval" split and index the reference transcripts by sample id.
# Any failure is reported and leaves `references` empty.
try:
    dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
    references = dict((row["id"], row["text"]) for row in dataset)
    print(f"Loaded {len(references)} reference transcriptions")
except Exception as load_error:
    print(f"Error loading dataset: {str(load_error)}")
    references = {}

# Initialize leaderboard
#
# The leaderboard is persisted as a CSV next to the app. On first start the
# file is seeded with placeholder rows; on later starts it is loaded and any
# column introduced by newer versions of the app is back-filled.
leaderboard_file = "leaderboard.csv"
if not os.path.exists(leaderboard_file):
    # Seed rows: model, WER, CER, timestamp, type, origin, task.
    # Combined_Score is deliberately NOT hard-coded here: it is derived below
    # so the seed data always agrees with the 0.7 * WER + 0.3 * CER formula
    # applied to real submissions (hand-typed values previously gave both
    # seed models the same score).
    sample_data = [
        ["MALIBA-AI/bambara-whisper-small", 0.2264, 0.1094, "2025-03-15 10:30:45", "Whisper-based", "Mali", "ASR"],
        ["OpenAI/whisper-base", 0.3264, 0.1094, "2025-03-15 10:30:45", "Foundation", "USA", "ASR"],
    ]
    leaderboard_df = pd.DataFrame(
        sample_data,
        columns=["Model_Name", "WER", "CER", "timestamp", "Type", "Origin", "Task"],
    )
    # Insert at position 3 to keep the on-disk column order unchanged:
    # Model_Name, WER, CER, Combined_Score, timestamp, Type, Origin, Task.
    leaderboard_df.insert(
        3,
        "Combined_Score",
        (leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3).round(4),
    )
    leaderboard_df.to_csv(leaderboard_file, index=False)
    print("Created new leaderboard file with sample data")
else:
    leaderboard_df = pd.read_csv(leaderboard_file)

    # Add new columns if they don't exist
    required_columns = ["Combined_Score", "Type", "Origin", "Task"]
    schema_changed = False
    for col in required_columns:
        if col in leaderboard_df.columns:
            continue
        if col == "Combined_Score":
            leaderboard_df[col] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
        else:
            leaderboard_df[col] = "ASR" if col == "Task" else "Unknown"
        schema_changed = True

    # Only touch the file when a column was actually back-filled.
    if schema_changed:
        leaderboard_df.to_csv(leaderboard_file, index=False)
    print(f"Loaded leaderboard with {len(leaderboard_df)} entries")

def normalize_text(text):
    """Return a canonical form of ``text`` for WER/CER scoring.

    Non-string input is converted with ``str()``. The result is lowercased,
    stripped of every character that is neither a word character nor
    whitespace, and has whitespace runs collapsed to single spaces with no
    leading or trailing space.
    """
    lowered = (text if isinstance(text, str) else str(text)).lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def calculate_metrics(predictions_df):
    """Calculate WER and CER for a submission against the benchmark references.

    Every row is matched by its ``id`` to the module-level ``references``
    mapping, and both texts are passed through ``normalize_text`` before
    scoring.

    * Rows whose id is not in ``references`` are skipped.
    * Rows whose reference normalizes to an empty string are skipped, since
      an error rate relative to zero reference tokens is undefined.
    * Rows whose prediction normalizes to an empty string are scored as 100%
      WER and CER (every reference token counts as deleted). They are NOT
      skipped, so blank predictions cannot be used to dodge hard samples.
    * Per-sample rates are capped at 2.0 so a single runaway hypothesis does
      not dominate the averages.

    Args:
        predictions_df: DataFrame with an "id" and a "text" column.

    Returns:
        Tuple ``(avg_wer, avg_cer, weighted_wer, weighted_cer, results)``.
        The ``avg_*`` values are plain means over scored samples, the
        ``weighted_*`` values weight each sample by its reference word /
        character count, and ``results`` is a list of per-sample dicts.

    Raises:
        ValueError: If no row could be scored.
    """
    max_sample_error = 2.0
    results = []

    for _, row in predictions_df.iterrows():
        id_val = row["id"]
        if id_val not in references:
            continue

        reference = normalize_text(references[id_val])
        if not reference:
            continue
        hypothesis = normalize_text(row["text"])

        try:
            if hypothesis:
                sample_wer = min(wer(reference, hypothesis), max_sample_error)
                sample_cer = min(cer(reference, hypothesis), max_sample_error)
            else:
                # Nothing was transcribed: all reference words/characters are
                # deletions, i.e. an error rate of exactly 1.0.
                sample_wer = 1.0
                sample_cer = 1.0
        except Exception as e:
            # Best effort: a sample the metric library rejects is reported
            # and left out instead of failing the whole submission.
            print(f"Error processing sample {id_val}: {str(e)}")
            continue

        results.append({
            "id": id_val,
            "reference": reference,
            "hypothesis": hypothesis,
            "ref_word_count": len(reference.split()),
            "ref_char_count": len(reference),
            "wer": sample_wer,
            "cer": sample_cer
        })

    if not results:
        raise ValueError("No valid samples for WER/CER calculation")

    avg_wer = sum(item["wer"] for item in results) / len(results)
    avg_cer = sum(item["cer"] for item in results) / len(results)

    # Every scored reference is non-empty, so both totals are > 0 here.
    total_ref_words = sum(item["ref_word_count"] for item in results)
    total_ref_chars = sum(item["ref_char_count"] for item in results)

    weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
    weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars

    return avg_wer, avg_cer, weighted_wer, weighted_cer, results

def format_as_percentage(value):
    """Render a fraction as a percentage string with two decimals, e.g. 0.5 -> "50.00%"."""
    percent = value * 100
    return f"{percent:.2f}%"

def get_performance_badge(score):
    """Translate an error score (lower is better) into a badge label.

    Scores below 0.15 earn "Excellent", scores below 0.30 earn "Good", and
    anything else (including values that compare false, such as NaN) is "Fair".
    """
    thresholds = (
        (0.15, "🏆 Excellent"),
        (0.30, "🥉 Good"),
    )
    for upper_bound, badge in thresholds:
        if score < upper_bound:
            return badge
    return "📈 Fair"

def add_medals_to_models(df, score_col="Combined_Score"):
    """Prefix the best-scoring models' names with a medal emoji.

    The frame is copied, sorted ascending by the numeric value of
    ``score_col`` (lower is better; values that cannot be parsed as numbers
    go last) and re-indexed from 0. Medals go to the three lowest *distinct*
    scores, so models that tie share the same medal. The input frame is
    returned untouched when it is empty or lacks ``score_col``.
    """
    if df.empty or score_col not in df.columns:
        return df

    numeric_col = f"{score_col}_float"

    ranked = df.copy()
    ranked[numeric_col] = pd.to_numeric(ranked[score_col], errors='coerce')
    ranked = ranked.sort_values(by=numeric_col, ascending=True, na_position='last').reset_index(drop=True)

    # Lowest three distinct scores -> gold, silver, bronze (zip stops after three).
    distinct_scores = np.sort(ranked[numeric_col].dropna().unique())
    podium = dict(zip(distinct_scores, ["🏆", "🥈", "🥉"]))

    ranked["Medal"] = ranked[numeric_col].map(
        lambda value: podium[value] + " " if value in podium else ""
    )
    ranked["Model_Name"] = ranked["Medal"] + ranked["Model_Name"].astype(str)

    # Drop the helper columns so only the caller's columns remain.
    return ranked.drop(columns=[numeric_col, "Medal"])

def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
    """Format the raw leaderboard for display with ranking and percentages.

    Args:
        df: Raw leaderboard frame (``Model_Name``, ``WER``, ``CER``,
            ``Combined_Score`` and optionally ``timestamp`` / ``Type``).
            ``None`` or an empty frame yields an empty display table.
        sort_by: Column used for ranking and medals; lower values rank first.

    Returns:
        A new DataFrame restricted to the display columns that could be
        built, in this order: Rank, Model, WER (%), CER (%),
        Combined Score (%), Performance, Type, Date.
    """
    display_columns = ["Rank", "Model", "WER (%)", "CER (%)", "Combined Score (%)", "Performance", "Type", "Date"]

    if df is None or len(df) == 0:
        return pd.DataFrame(columns=display_columns)

    display_df = df.copy()

    # Shorten "org/model" to "model" BEFORE medals are added. Shortening
    # afterwards would split "🏆 org/model" on "/" and throw the medal away.
    display_df["Model_Name"] = display_df["Model_Name"].apply(lambda name: str(name).split("/")[-1])

    # Add medals (this prefixes Model_Name with the medal emoji)
    display_df = add_medals_to_models(display_df, sort_by)

    # Sort by the numeric value of the ranking column; a stable sort keeps
    # tied rows in a deterministic order.
    display_df[f"{sort_by}_float"] = pd.to_numeric(display_df[sort_by], errors='coerce')
    display_df = display_df.sort_values(
        f"{sort_by}_float", ascending=True, na_position='last', kind="mergesort"
    )

    # Add rank
    display_df.insert(0, "Rank", range(1, len(display_df) + 1))

    # Format percentages. The labels must match `display_columns` exactly,
    # otherwise the column is silently filtered out below (this is why the
    # combined score used to be missing: "Combined_Score (%)" vs
    # "Combined Score (%)").
    percent_labels = {
        "WER": "WER (%)",
        "CER": "CER (%)",
        "Combined_Score": "Combined Score (%)",
    }
    for source_col, label in percent_labels.items():
        if source_col in display_df.columns:
            numeric = pd.to_numeric(display_df[source_col], errors='coerce')
            display_df[label] = numeric.apply(lambda x: f"{x * 100:.2f}" if pd.notna(x) else "---")

    # Add performance badges
    if "Combined_Score" in display_df.columns:
        combined = pd.to_numeric(display_df["Combined_Score"], errors='coerce')
        display_df["Performance"] = combined.apply(lambda x: get_performance_badge(x) if pd.notna(x) else "---")
    else:
        display_df["Performance"] = "---"

    # Medal-prefixed short name shown to users
    display_df["Model"] = display_df["Model_Name"]

    # Format date; unparseable timestamps use the same placeholder as a
    # missing column.
    if "timestamp" in display_df.columns:
        parsed = pd.to_datetime(display_df["timestamp"], errors='coerce')
        display_df["Date"] = parsed.dt.strftime("%Y-%m-%d").fillna("---")
    else:
        display_df["Date"] = "---"

    # Keep only the display columns that exist (e.g. "Type" may be absent).
    available_columns = [col for col in display_columns if col in display_df.columns]
    return display_df[available_columns]

def create_performance_chart():
    """Create the performance visualization chart from the leaderboard CSV.

    Reads ``leaderboard_file``, orders models by "Combined_Score" and draws
    WER and CER as bars plus the combined score as a line, all in percent.

    Returns:
        The figure built with ``go.Figure``, or None when the leaderboard is
        empty or any step fails (the error is printed).
    """
    try:
        df = pd.read_csv(leaderboard_file)
        if len(df) == 0:
            return None

        # Sort by Combined_Score
        df = df.sort_values("Combined_Score")

        # Short axis labels: the text after the last "/". str() keeps a name
        # that pandas parsed as a number (or NaN) from raising and taking the
        # whole chart down. Computed once and shared by all three traces.
        labels = df["Model_Name"].apply(lambda name: str(name).split("/")[-1])

        fig = go.Figure()

        # Add WER bars
        fig.add_trace(go.Bar(
            name="WER",
            x=labels,
            y=df["WER"] * 100,
            marker_color='#ff7f0e',
            hovertemplate='<b>%{x}</b><br>WER: %{y:.2f}%<extra></extra>'
        ))

        # Add CER bars
        fig.add_trace(go.Bar(
            name="CER",
            x=labels,
            y=df["CER"] * 100,
            marker_color='#2ca02c',
            hovertemplate='<b>%{x}</b><br>CER: %{y:.2f}%<extra></extra>'
        ))

        # Add Combined Score line
        fig.add_trace(go.Scatter(
            name="Combined Score",
            x=labels,
            y=df["Combined_Score"] * 100,
            mode='lines+markers',
            line=dict(color='#d62728', width=3),
            marker=dict(size=8),
            hovertemplate='<b>%{x}</b><br>Combined Score: %{y:.2f}%<extra></extra>'
        ))

        fig.update_layout(
            title={
                'text': "📊 Model Performance Comparison",
                'x': 0.5,
                'font': {'size': 18, 'family': 'Rubik'}
            },
            xaxis_title="Model",
            yaxis_title="Error Rate (%)",
            hovermode='x unified',
            height=500,
            showlegend=True,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(family="Inter", size=12),
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
            )
        )

        return fig
    except Exception as e:
        # Best effort: the UI simply shows no chart when something goes wrong
        print(f"Error creating chart: {str(e)}")
        return None

def get_leaderboard_stats():
    """Build the HTML summary statistics block for the leaderboard.

    Reads ``leaderboard_file`` and reports the number of models, the lowest
    "Combined_Score", the mean WER and CER, and the name of the model with
    the lowest combined score.

    Returns:
        An HTML string. For an empty leaderboard it contains a single "0"
        card; if anything fails it is a short error paragraph.
    """
    from html import escape  # stdlib; imported locally because only this function needs it

    try:
        df = pd.read_csv(leaderboard_file)
        if len(df) == 0:
            return """
            <div class="stats-grid">
                <div class="stat-card">
                    <div class="stat-number">0</div>
                    <div class="stat-label">Models Submitted</div>
                </div>
            </div>
            """

        # Lower combined score is better, so the champion is the minimum
        best_model = df.loc[df["Combined_Score"].idxmin()]
        total_models = len(df)
        avg_wer = df["WER"].mean()
        avg_cer = df["CER"].mean()

        # Model names are typed in by submitters and this string is rendered
        # as raw HTML, so the name must be escaped before it is embedded.
        champion = escape(str(best_model['Model_Name']))

        return f"""
        <div class="stats-grid">
            <div class="stat-card">
                <div class="stat-number">{total_models}</div>
                <div class="stat-label">Models Evaluated</div>
            </div>
            <div class="stat-card">
                <div class="stat-number">{format_as_percentage(best_model['Combined_Score'])}</div>
                <div class="stat-label">Best Combined Score</div>
            </div>
            <div class="stat-card">
                <div class="stat-number">{format_as_percentage(avg_wer)}</div>
                <div class="stat-label">Average WER</div>
            </div>
            <div class="stat-card">
                <div class="stat-number">{format_as_percentage(avg_cer)}</div>
                <div class="stat-label">Average CER</div>
            </div>
        </div>

        <div style="text-align: center; margin-top: 20px;">
            <h4>🏆 Current Champion: {champion}</h4>
        </div>
        """
    except Exception as e:
        # The message may echo file contents, so escape it as well
        return f"<p>Error loading stats: {escape(str(e))}</p>"

def update_ranking(method):
    """Re-rank the stored leaderboard according to the chosen ranking method.

    "WER Only" ranks by the "WER" column and "CER Only" by "CER"; any other
    value ranks by "Combined_Score". If the CSV has no "Combined_Score"
    column it is derived as 0.7 * WER + 0.3 * CER before ranking.

    Returns:
        The display table produced by ``prepare_leaderboard_for_display``,
        or an empty frame with the display columns when anything fails
        (the error is printed).
    """
    fallback_columns = ["Rank", "Model", "WER (%)", "CER (%)", "Combined Score (%)", "Performance", "Type", "Date"]
    try:
        board = pd.read_csv(leaderboard_file)

        has_combined = "Combined_Score" in board.columns
        if not has_combined:
            board["Combined_Score"] = board["WER"] * 0.7 + board["CER"] * 0.3

        # Resolve the radio label to the column used for sorting
        sort_column = (
            "WER" if method == "WER Only"
            else "CER" if method == "CER Only"
            else "Combined_Score"
        )
        ranked = prepare_leaderboard_for_display(board, sort_column)
        return ranked

    except Exception as e:
        print(f"Error updating ranking: {str(e)}")
        return pd.DataFrame(columns=fallback_columns)

def compare_models(model_1_name, model_2_name):
    """Compare the stored metrics of two leaderboard models.

    Args:
        model_1_name: "Model_Name" value of the first model.
        model_2_name: "Model_Name" value of the second model.

    Returns:
        A DataFrame with a "Metric" column, one column per model and a
        "Difference" column (model 1 minus model 2, in percentage points).
        Because the metrics are error rates, a negative difference means
        model 1 has the lower error. A one-cell "Info" frame is returned when
        the names are equal or a model is missing, and a one-cell "Error"
        frame when anything fails.
    """
    try:
        df = pd.read_csv(leaderboard_file)

        if model_1_name == model_2_name:
            return pd.DataFrame([{"Info": "Please select two different models to compare."}])

        model_1 = df[df["Model_Name"] == model_1_name]
        model_2 = df[df["Model_Name"] == model_2_name]

        if model_1.empty or model_2.empty:
            return pd.DataFrame([{"Info": "One or both models not found in leaderboard."}])

        # If a name appears more than once, the first matching row is used
        m1 = model_1.iloc[0]
        m2 = model_2.iloc[0]

        # Column headers are the short names (text after the last "/").
        # Two models from different organisations can share a short name;
        # as dict keys they would collide and one column would overwrite the
        # other, so fall back to the full names in that case.
        label_1 = model_1_name.split("/")[-1]
        label_2 = model_2_name.split("/")[-1]
        if label_1 == label_2:
            label_1, label_2 = model_1_name, model_2_name

        # (row label, leaderboard column) for each compared metric
        metrics = [("WER", "WER"), ("CER", "CER"), ("Combined Score", "Combined_Score")]

        comparison_data = {
            "Metric": [title for title, _ in metrics],
            label_1: [f"{m1[col]*100:.2f}%" for _, col in metrics],
            label_2: [f"{m2[col]*100:.2f}%" for _, col in metrics],
            "Difference": [f"{(m1[col] - m2[col])*100:+.2f}%" for _, col in metrics],
        }

        return pd.DataFrame(comparison_data)

    except Exception as e:
        return pd.DataFrame([{"Error": f"Error comparing models: {str(e)}"}])

def process_submission(model_name, csv_file, model_type, origin_country):
    """Validate a predictions CSV, score it and record it on the leaderboard.

    Args:
        model_name: Name under which the model is stored. Surrounding
            whitespace is removed so the same model cannot be entered twice
            under names that differ only in spacing.
        csv_file: Path or file object accepted by ``pd.read_csv``; the file
            must have exactly the columns "id" and "text".
        model_type: Value stored in the "Type" column.
        origin_country: Value stored in the "Origin" column.

    Returns:
        A ``(message, leaderboard, chart)`` tuple. On any validation or
        processing error the message describes the problem and the other two
        items are None. On success the leaderboard CSV has been rewritten and
        the tuple carries a Markdown summary, the display table and the
        refreshed chart.
    """
    if not model_name or not model_name.strip():
        return "❌ **Error:** Please provide a model name.", None, None

    if not csv_file:
        return "❌ **Error:** Please upload a CSV file.", None, None

    # The check above already ignores whitespace; store the trimmed name too,
    # otherwise "model" and "model " would become two leaderboard entries.
    model_name = model_name.strip()

    try:
        df = pd.read_csv(csv_file)

        if len(df) == 0:
            return "❌ **Error:** Uploaded CSV is empty.", None, None

        if set(df.columns) != {"id", "text"}:
            return f"❌ **Error:** CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None, None

        if df["id"].duplicated().any():
            dup_ids = df[df["id"].duplicated()]["id"].unique()
            return f"❌ **Error:** Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None, None

        # The submission must cover exactly the reference IDs
        reference_ids = set(references.keys())
        submitted_ids = set(df["id"])
        missing_ids = reference_ids - submitted_ids
        extra_ids = submitted_ids - reference_ids

        # Sets are unordered; sort (by string form, so mixed types cannot
        # fail) to make the preview in the message reproducible.
        if missing_ids:
            preview = ', '.join(map(str, sorted(missing_ids, key=str)[:5]))
            return f"❌ **Error:** Missing {len(missing_ids)} IDs in submission. First few missing: {preview}", None, None

        if extra_ids:
            preview = ', '.join(map(str, sorted(extra_ids, key=str)[:5]))
            return f"❌ **Error:** Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {preview}", None, None

        try:
            # Only the two averages are used here; the remaining values are unpacked but ignored
            avg_wer, avg_cer, _weighted_wer, _weighted_cer, _detailed_results = calculate_metrics(df)

            # A near-zero WER is rejected as suspicious rather than accepted
            if avg_wer < 0.001:
                return "❌ **Error:** WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None, None

        except Exception as e:
            return f"❌ **Error calculating metrics:** {str(e)}", None, None

        # Update leaderboard
        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        combined_score = avg_wer * 0.7 + avg_cer * 0.3

        existing = leaderboard["Model_Name"] == model_name
        if existing.any():
            # Re-submission: overwrite the stored results for this model
            leaderboard.loc[existing, "WER"] = avg_wer
            leaderboard.loc[existing, "CER"] = avg_cer
            leaderboard.loc[existing, "Combined_Score"] = combined_score
            leaderboard.loc[existing, "timestamp"] = timestamp
            leaderboard.loc[existing, "Type"] = model_type
            leaderboard.loc[existing, "Origin"] = origin_country
            updated_leaderboard = leaderboard
        else:
            new_entry = pd.DataFrame(
                [[model_name, avg_wer, avg_cer, combined_score, timestamp, model_type, origin_country, "ASR"]],
                columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"]
            )
            # ignore_index avoids duplicate index labels in the combined frame
            updated_leaderboard = pd.concat([leaderboard, new_entry], ignore_index=True)

        updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
        updated_leaderboard.to_csv(leaderboard_file, index=False)

        display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
        # The chart is rebuilt from the CSV that was just written
        chart = create_performance_chart()

        badge = get_performance_badge(combined_score)

        success_msg = f"""
        ✅ **Submission processed successfully!**
        
        **{model_name}** ({model_type} from {origin_country})
        - **WER:** {format_as_percentage(avg_wer)}
        - **CER:** {format_as_percentage(avg_cer)} 
        - **Combined Score:** {format_as_percentage(combined_score)}
        - **Performance:** {badge}
        """

        return success_msg, display_leaderboard, chart

    except Exception as e:
        return f"❌ **Error processing submission:** {str(e)}", None, None

def get_current_leaderboard():
    """Load the leaderboard CSV, adding any columns it is missing.

    A missing "Combined_Score" is derived as 0.7 * WER + 0.3 * CER; missing
    "Type" and "Origin" default to "Unknown" and a missing "Task" to "ASR".
    The file is written back only when a column was actually added, so a
    plain read no longer rewrites the CSV on every call.

    Returns:
        The leaderboard DataFrame, or an empty frame with the standard
        columns when the file does not exist or cannot be processed (the
        error is printed).
    """
    standard_columns = ["Model_Name", "WER", "CER", "Combined_Score", "timestamp", "Type", "Origin", "Task"]
    try:
        if not os.path.exists(leaderboard_file):
            return pd.DataFrame(columns=standard_columns)

        current_leaderboard = pd.read_csv(leaderboard_file)
        schema_changed = False

        # Ensure all required columns exist
        if "Combined_Score" not in current_leaderboard.columns:
            current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
            schema_changed = True

        metadata_defaults = {"Type": "Unknown", "Origin": "Unknown", "Task": "ASR"}
        for col, default in metadata_defaults.items():
            if col not in current_leaderboard.columns:
                current_leaderboard[col] = default
                schema_changed = True

        # Persist the upgraded schema once; skip the write when nothing changed
        if schema_changed:
            current_leaderboard.to_csv(leaderboard_file, index=False)

        return current_leaderboard
    except Exception as e:
        print(f"Error getting leaderboard: {str(e)}")
        return pd.DataFrame(columns=standard_columns)

def create_leaderboard_table():
    """Return the stored leaderboard formatted as the display table.

    Loads the data with ``get_current_leaderboard`` and passes it through
    ``prepare_leaderboard_for_display`` using that function's default ranking.
    """
    return prepare_leaderboard_for_display(get_current_leaderboard())

def df_to_html(df):
    """Convert a DataFrame to an HTML table with the leaderboard styling.

    Args:
        df: DataFrame to render; None is treated like an empty frame.

    Returns:
        A centered "No data available" paragraph for missing or empty input,
        otherwise the table markup carrying the "leaderboard-table" class and
        an inline full-width style.
    """
    if df is None or df.empty:
        return "<p style='text-align: center; color: #666;'>No data available</p>"

    # NOTE(review): escape=False renders cell values as raw HTML. Confirm
    # that callers only pass trusted or pre-escaped content.
    html = df.to_html(index=False, escape=False, classes="leaderboard-table")

    # pandas opens the table as <table border="1" class="dataframe leaderboard-table">,
    # so matching on '<table class="leaderboard-table"' never hit and the style
    # was never applied. Attach it to the opening tag itself (first match only).
    html = html.replace('<table ', '<table style="width: 100%; margin: 0 auto;" ', 1)

    return html

# Main Gradio Interface
# Builds the whole UI: header, stats, four tabs (leaderboard, submission,
# comparison, methodology) and the citation/footer. Everything inside the
# `with` runs once at import time, so initial values are snapshots of the CSV
# taken when the app starts.
with gr.Blocks(
    title="🇲🇱 Bambara ASR Leaderboard | MALIBA-AI", 
    css=sahara_style_css, 
    head=custom_head_html,
    theme=gr.themes.Soft()
) as demo:

    # Header Section
    gr.HTML(new_header_html)

    # Navigation Buttons
    with gr.Row():
        gr.Button("🌐 MALIBA-AI Website", link="https://maliba-ai.org/", elem_classes=['nav-button'])
        gr.Button("📊 HF Dataset Repo", link="https://huggingface.co/datasets/sudoping01/bambara-speech-recognition-benchmark", elem_classes=['nav-button'])
        gr.Button("🤗 MALIBA-AI Hub", link="https://huggingface.co/MALIBA-AI", elem_classes=['nav-button'])
        gr.Button("📚 Documentation", link="https://huggingface.co/spaces/MALIBA-AI/bambara-asr-leaderboard", elem_classes=['nav-button'])

    with gr.Group(elem_classes="content-card"):
        # Stats display
        stats_html = gr.HTML(get_leaderboard_stats())

        with gr.Tabs() as tabs:
            with gr.TabItem("🏅 Main Leaderboard", id="main"):
                gr.HTML("<h2>Main Leaderboard</h2>")

                # Shared initial value for this tab's table and the one on the submit tab
                initial_leaderboard = create_leaderboard_table()

                with gr.Row():
                    ranking_method = gr.Radio(
                        ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"], 
                        label="🔄 Ranking Method",
                        value="Combined Score (WER 70%, CER 30%)",
                        info="Choose how to rank the models"
                    )

                leaderboard_view = gr.DataFrame(
                    value=initial_leaderboard,
                    interactive=False,
                    label="📋 Leaderboard Rankings - Lower scores indicate better performance",
                    wrap=True,
                    height=400
                )

                # Performance chart
                gr.Markdown("### 📊 Visual Performance Comparison")
                performance_chart = gr.Plot(
                    value=create_performance_chart(),
                    label="Model Performance Visualization"
                )

                # Re-sort the table whenever a different ranking method is picked
                ranking_method.change(
                    fn=update_ranking,
                    inputs=[ranking_method],
                    outputs=[leaderboard_view]
                )

                with gr.Accordion("📖 Understanding ASR Metrics", open=False):
                    gr.Markdown("""
                    ## 🎯 Automatic Speech Recognition Evaluation Metrics

                    ### Word Error Rate (WER)
                    **WER** measures transcription accuracy at the word level:
                    - **Formula:** `(Substitutions + Insertions + Deletions) / Total Reference Words`
                    - **Range:** 0% (perfect) to 100%+ (very poor)
                    - **Interpretation:** 
                      - 0-5%: 🏆 Excellent performance
                      - 5-15%: 🥉 Good performance  
                      - 15-30%: 📈 Fair performance
                      - 30%+: Poor performance

                    ### Character Error Rate (CER)
                    **CER** measures transcription accuracy at the character level:
                    - **Advantage:** More granular than WER, captures partial matches
                    - **Benefit for Bambara:** Particularly valuable for agglutinative languages
                    - **Typical Range:** Usually lower than WER values

                    ### Combined Score (Primary Ranking Metric)
                    **Formula:** `Combined Score = 0.7 × WER + 0.3 × CER`
                    - **Rationale:** Balanced evaluation emphasizing word-level accuracy
                    - **Usage:** Primary metric for model ranking

                    ### 🎯 Performance Categories
                    - 🏆 **Excellent**: < 15% Combined Score
                    - 🥉 **Good**: 15-30% Combined Score
                    - 📈 **Fair**: > 30% Combined Score
                    """)

            with gr.TabItem("📤 Submit New Model", id="submit"):
                gr.HTML("<h2>Submit Your Bambara ASR Model</h2>")

                gr.Markdown("""
                ### 🚀 Ready to benchmark your model? Submit your results and join the leaderboard!

                Follow these steps to submit your Bambara ASR model for evaluation.
                """)

                with gr.Group(elem_classes="form-section"):
                    with gr.Row():
                        with gr.Column(scale=2):
                            model_name_input = gr.Textbox(
                                label="🤖 Model Name", 
                                placeholder="e.g., MALIBA-AI/bambara-whisper-large",
                                info="Use a descriptive name (organization/model format preferred)"
                            )

                            model_type = gr.Dropdown(
                                label="🏷️ Model Type",
                                choices=["Whisper-based", "Wav2Vec2", "Foundation", "Custom", "Fine-tuned", "Multilingual", "Other"],
                                value="Custom",
                                info="Select the type/architecture of your model"
                            )

                            origin_country = gr.Dropdown(
                                label="🌍 Origin/Institution",
                                choices=["Mali", "Senegal", "Burkina Faso", "Niger", "Guinea", "Ivory Coast", "USA", "France", "Canada", "UK", "Other"],
                                value="Mali",
                                info="Country or region of the developing institution"
                            )

                        with gr.Column(scale=1):
                            gr.Markdown("""
                            #### 📋 Submission Requirements

                            **CSV Format:**
                            - Columns: `id`, `text`
                            - Match all reference dataset IDs
                            - No duplicate IDs
                            - Text transcriptions in Bambara

                            **Data Quality:**
                            - Clean, normalized text
                            - Consistent formatting
                            - Complete coverage of test set
                            """)

                csv_upload = gr.File(
                    label="📁 Upload Predictions CSV", 
                    file_types=[".csv"],
                    info="Upload your model's transcriptions in the required CSV format"
                )

                submit_btn = gr.Button("🚀 Submit Model", variant="primary", size="lg", elem_classes=['gradio-button', 'primary'])

                output_msg = gr.Markdown(label="📢 Submission Status")

                with gr.Row():
                    leaderboard_display = gr.DataFrame(
                        label="📊 Updated Leaderboard",
                        value=initial_leaderboard,
                        interactive=False,
                        wrap=True,
                        height=400
                    )

                    updated_chart = gr.Plot(
                        label="📈 Updated Performance Chart"
                    )

                # process_submission returns (message, table, chart) in this order
                submit_btn.click(
                    fn=process_submission,
                    inputs=[model_name_input, csv_upload, model_type, origin_country],
                    outputs=[output_msg, leaderboard_display, updated_chart]
                )

            with gr.TabItem("🔍 Compare Models", id="compare"):
                gr.HTML("<h2>Compare Two Models</h2>")

                gr.Markdown("### Select two models to compare their performance side-by-side")

                with gr.Row():
                    # Choices are read once while the UI is built
                    current_data = get_current_leaderboard()
                    model_names = current_data["Model_Name"].tolist() if not current_data.empty else []

                    model_1_dropdown = gr.Dropdown(
                        choices=model_names,
                        label="🤖 Model 1",
                        info="Select the first model for comparison"
                    )
                    model_2_dropdown = gr.Dropdown(
                        choices=model_names,
                        label="🤖 Model 2", 
                        info="Select the second model for comparison"
                    )

                compare_btn = gr.Button("⚡ Compare Models", variant="primary", elem_classes=['gradio-button', 'primary'])

                # compare_models reports Difference = Model 1 - Model 2 on error
                # rates, so a NEGATIVE value is the one that favours Model 1.
                # The previous wording had the two signs swapped.
                comparison_note = gr.Markdown("""
                **Note on Comparison Results:**
                - Difference is Model 1 minus Model 2
                - Negative difference values (🟢) indicate Model 1 performed better
                - Positive difference values (🔴) indicate Model 2 performed better
                - Lower error rates indicate better performance
                """, visible=False)

                comparison_output = gr.DataFrame(
                    label="📊 Model Comparison Results",
                    value=pd.DataFrame([{"Info": "Select two models and click Compare to see the results."}]),
                    interactive=False
                )

                def update_comparison_table(m1, m2):
                    """Return (note visibility update, comparison table) for the two selections."""
                    if not m1 or not m2:
                        return gr.update(visible=False), pd.DataFrame([{"Info": "Please select both models before clicking Compare."}])

                    if m1 == m2:
                        return gr.update(visible=False), pd.DataFrame([{"Info": "Please select two different models to compare."}])

                    df = compare_models(m1, m2)
                    return gr.update(visible=True), df

                compare_btn.click(
                    fn=update_comparison_table,
                    inputs=[model_1_dropdown, model_2_dropdown],
                    outputs=[comparison_note, comparison_output]
                )

            with gr.TabItem("📊 Dataset & Methodology", id="dataset"):
                gr.HTML("<h2>Dataset & Methodology</h2>")

                gr.Markdown("""
                ## 🎯 About the Bambara Speech Recognition Benchmark

                ### 📈 Dataset Overview

                Our benchmark is built on the **`sudoping01/bambara-speech-recognition-benchmark`** dataset, featuring:

                - **🎙️ Diverse Audio Samples:** Various speakers, dialects, and recording conditions
                - **🗣️ Speaker Variety:** Multiple native Bambara speakers from different regions
                - **🎵 Acoustic Diversity:** Different recording environments and quality levels
                - **✅ Quality Assurance:** Manually validated transcriptions
                - **📚 Content Variety:** Multiple domains and speaking styles

                ### 🔬 Evaluation Methodology

                #### Text Normalization Process
                1. **Lowercase conversion** for consistency
                2. **Punctuation removal** to focus on linguistic content  
                3. **Whitespace normalization** for standardized formatting
                4. **Unicode normalization** for proper character handling

                #### Quality Controls
                - **Outlier Detection:** Extreme error rates are capped to prevent skewing
                - **Data Validation:** Comprehensive format and completeness checks
                - **Duplicate Prevention:** Automatic detection of duplicate submissions
                - **Missing Data Handling:** Identification of incomplete submissions

                ### 🚀 How to Participate

                #### Step 1: Access the Dataset
                ```python
                from datasets import load_dataset
                dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark")
                ```

                #### Step 2: Generate Predictions
                - Process the audio files with your ASR model
                - Generate transcriptions for each audio sample
                - Ensure your model outputs text in Bambara language

                #### Step 3: Format Results
                Create a CSV file with exactly these columns:
                - **`id`**: Sample identifier (must match dataset IDs)
                - **`text`**: Your model's transcription

                #### Step 4: Submit & Evaluate
                - Upload your CSV using the submission form
                - Your model will be automatically evaluated
                - Results appear on the leaderboard immediately

                ### 🏆 Recognition & Impact

                **Top-performing models will be:**
                - Featured prominently on our leaderboard
                - Highlighted in MALIBA-AI communications
                - Considered for inclusion in production systems
                - Invited to present at community events

                ### 🤝 Community Guidelines

                - **Reproducibility:** Please provide model details and methodology
                - **Fair Play:** No data leakage or unfair advantages
                - **Collaboration:** Share insights and learnings with the community
                - **Attribution:** Properly cite the benchmark in publications

                ### 📚 Technical Specifications

                | Aspect | Details |
                |--------|---------|
                | **Audio Format** | WAV, various sample rates |
                | **Language** | Bambara (bam) |
                | **Evaluation Metrics** | WER, CER, Combined Score |
                | **Text Encoding** | UTF-8 |
                | **Submission Format** | CSV with id, text columns |
                """)

    # Citation and Footer
    with gr.Group(elem_classes="content-card"):
        gr.HTML("""
        <div class="citation-block">
            <h2>📚 Citation</h2>
            <p>If you use the Bambara ASR Leaderboard for your scientific publication, or if you find the resources useful, please cite our work:</p>
            <pre>
@misc{bambara_asr_leaderboard_2025,
  title={Bambara Speech Recognition Leaderboard},
  author={MALIBA-AI Team},
  year={2025},
  url={https://huggingface.co/spaces/MALIBA-AI/bambara-asr-leaderboard},
  note={A community initiative for advancing Bambara speech recognition technology}
}
            </pre>
        </div>
        """)

        gr.HTML("""
        <div style="text-align: center; margin-top: 30px; padding-top: 20px; border-top: 2px solid #e9ecef;">
            <h3 style="color: #7d3561; margin-bottom: 15px;">About MALIBA-AI</h3>
            <p style="font-size: 16px; line-height: 1.6; max-width: 800px; margin: 0 auto;">
                <strong>MALIBA-AI: Empowering Mali's Future Through Community-Driven AI Innovation</strong><br>
                <em>"No Malian Language Left Behind"</em>
            </p>
            <p style="margin-top: 15px;">
                This leaderboard is maintained by the MALIBA-AI initiative to track progress in Bambara speech recognition technology.
                For more information, visit <a href="https://maliba-ai.org/" style="color: #7d3561; font-weight: 600;">MALIBA-AI</a> or 
                <a href="https://huggingface.co/MALIBA-AI" style="color: #7d3561; font-weight: 600;">our Hugging Face page</a>.
            </p>
            <div style="margin-top: 20px;">
                <span style="font-size: 2em;">🇲🇱</span>
                <span style="margin: 0 20px; color: #7d3561; font-weight: 600;">•</span>
                <span style="font-size: 2em;">🤝</span>
                <span style="margin: 0 20px; color: #7d3561; font-weight: 600;">•</span>
                <span style="font-size: 2em;">🚀</span>
            </div>
        </div>
        """)

# Start the app only when the file is run as a script
if __name__ == "__main__":
    demo.launch()