# (removed: Hugging Face Spaces page-viewer residue — status lines, git blame
# hashes, and the line-number gutter; not part of the source file)
import gradio as gr
import pandas as pd
import numpy as np
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
def load_speechiq_data(csv_path="SpeechIQ_table.csv"):
    """Load and rank the SpeechIQ results from a CSV file.

    Args:
        csv_path: Path to the results CSV. Defaults to the repo-root table
            so existing no-argument callers are unaffected.

    Returns:
        pd.DataFrame: Rows sorted by 'Speech IQ' (descending) with a 'Rank'
        column (medal emojis for the top 3, plain numbers below), columns
        reordered for display. On any load/parse error an empty DataFrame
        with the expected columns is returned so the UI still renders.
    """
    display_columns = ['Rank', 'Speech IQ', 'Remember', 'Understand',
                       'Apply', 'Model Type', 'Setup', 'Audio Encoder']
    try:
        df = pd.read_csv(csv_path)

        # Round numerical columns to 3 decimal places for better display.
        for col in ('Remember', 'Understand', 'Apply', 'Speech IQ'):
            if col in df.columns:
                df[col] = df[col].round(3)

        # Primary metric first: best Speech IQ at the top.
        df = df.sort_values('Speech IQ', ascending=False).reset_index(drop=True)

        # Medal emojis for the podium, 1-based position numbers below.
        # NOTE(review): the source's medal strings were mojibake (all three
        # identical) — restored to the conventional gold/silver/bronze set.
        medals = {0: '🥇', 1: '🥈', 2: '🥉'}
        df['Rank'] = [medals.get(i, str(i + 1)) for i in range(len(df))]

        # Reorder columns to put Rank and Speech IQ first.
        return df[display_columns]
    except Exception as e:
        # Best-effort: keep the UI alive with an empty table when the CSV
        # is missing or malformed.
        print(f"Error loading SpeechIQ data: {e}")
        return pd.DataFrame(columns=display_columns)
def get_top_performers(df):
    """Build a markdown summary of leaderboard statistics.

    Args:
        df: Leaderboard DataFrame with at least 'Speech IQ', 'Setup' and
            'Model Type' columns (as produced by load_speechiq_data).

    Returns:
        str: Markdown table listing the top performer, best score per model
        type and the total model count, or "No data available." for an
        empty DataFrame.
    """
    if df.empty:
        return "No data available."

    top_score = df['Speech IQ'].max()
    top_model = df.loc[df['Speech IQ'].idxmax()]

    def _best_score(kind):
        # Best Speech IQ among rows whose 'Model Type' mentions `kind`;
        # 0 when no such rows exist (same fallback as the original).
        subset = df[df['Model Type'].str.contains(kind, na=False)]
        return subset['Speech IQ'].max() if not subset.empty else 0

    # NOTE(review): emoji in this template were mojibake in the source;
    # replaced with plausible icons — confirm against the original file.
    stats_text = f"""
## 📊 Leaderboard Statistics
| Metric | Value |
|--------|-------|
| 🏆 **Top Performer** | {top_model['Setup']} |
| 🎯 **Highest Score** | **{top_score}** |
| 🤖 **Best Agentic Model** | {_best_score('Agentic')} |
| 🔊 **Best End2End Model** | {_best_score('End2End')} |
| 📋 **Total Models** | {len(df)} |
"""
    return stats_text
# Load the data once at startup; the refresh button reloads it on demand.
speechiq_df = load_speechiq_data()

# Create the Gradio interface.
# NOTE(review): emoji in labels below were mojibake in the source; replaced
# with plausible icons — confirm against the original file.
demo = gr.Blocks(css=custom_css, title="SpeechIQ Leaderboard")

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 SpeechIQ Leaderboard", elem_id="speechiq-leaderboard-tab", id=0):
            # Main leaderboard table.
            with gr.Row():
                leaderboard_table = gr.Dataframe(
                    value=speechiq_df,
                    # Fallback headers match the empty-DataFrame columns
                    # produced by load_speechiq_data() on error.
                    headers=speechiq_df.columns.tolist() if not speechiq_df.empty else [
                        'Rank', 'Speech IQ', 'Remember', 'Understand',
                        'Apply', 'Model Type', 'Setup', 'Audio Encoder',
                    ],
                    interactive=False,
                    elem_classes="leaderboard-table"
                )
            # Legend and explanation.
            with gr.Row():
                gr.Markdown("""
### 📖 Column Explanations
- **Rank**: Position ranking with 🥇🥈🥉 medals for top 3 performers
- **Speech IQ**: Overall intelligence quotient combining all dimensions (primary metric)
- **Remember**: Verbatim accuracy score (WER-based)
- **Understand**: Semantic interpretation similarity score
- **Apply**: Downstream task performance score
- **Model Type**: Architecture approach (Agentic vs End2End)
- **Setup**: Specific model configuration and components
- **Audio Encoder**: The audio processing component used

*Higher scores indicate better performance across all metrics.*
""", elem_classes="markdown-text")

        with gr.TabItem("📊 Analysis", elem_id="analysis-tab", id=1):
            with gr.Row():
                # Static analysis rendered from the data loaded at startup.
                if not speechiq_df.empty:
                    # Group by model type for comparison.
                    agentic_models = speechiq_df[speechiq_df['Model Type'].str.contains('Agentic', na=False)]
                    end2end_models = speechiq_df[speechiq_df['Model Type'].str.contains('End2End', na=False)]
                    comparison_text = f"""
### 📈 Model Type Comparison

**Agentic Models (ASR + LLM):**
- Count: {len(agentic_models)}
- Average Speech IQ: {agentic_models['Speech IQ'].mean():.2f}
- Best Score: {agentic_models['Speech IQ'].max():.2f}

**End-to-End Models:**
- Count: {len(end2end_models)}
- Average Speech IQ: {end2end_models['Speech IQ'].mean():.2f}
- Best Score: {end2end_models['Speech IQ'].max():.2f}

### 🎯 Cognitive Dimension Analysis

**Remember (Verbatim Accuracy):**
- Best performer: {speechiq_df.loc[speechiq_df['Remember'].idxmax(), 'Setup']} ({speechiq_df['Remember'].max():.3f})

**Understand (Semantic Similarity):**
- Best performer: {speechiq_df.loc[speechiq_df['Understand'].idxmax(), 'Setup']} ({speechiq_df['Understand'].max():.3f})

**Apply (Task Performance):**
- Best performer: {speechiq_df.loc[speechiq_df['Apply'].idxmax(), 'Setup']} ({speechiq_df['Apply'].max():.3f})
"""
                    gr.Markdown(comparison_text, elem_classes="markdown-text")
                else:
                    gr.Markdown("No data available for analysis.", elem_classes="markdown-text")

            # Statistics section - placed after the comparison block.
            with gr.Row():
                gr.Markdown(get_top_performers(speechiq_df), elem_classes="markdown-text stats-section")

        with gr.TabItem("📝 About", elem_id="about-tab", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit", elem_id="submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

    # Citation section.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=6,
                elem_id="citation-button",
                show_copy_button=True,
            )

    # Refresh functionality: re-read the CSV and update the table in place.
    with gr.Row():
        refresh_button = gr.Button("🔄 Refresh Data", variant="secondary")

    def refresh_data():
        """Reload the leaderboard CSV and return the fresh DataFrame."""
        return load_speechiq_data()

    refresh_button.click(
        refresh_data,
        outputs=leaderboard_table
    )

if __name__ == "__main__":
    # Bind to all interfaces on the standard HF Spaces port.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)