# huckiyang's picture
# [release] speechIQ layout imprv-v4
# 28e6718
import gradio as gr
import pandas as pd
import numpy as np
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
def load_speechiq_data(csv_path="SpeechIQ_table.csv"):
    """Load the SpeechIQ results table and prepare it for display.

    Reads the CSV at *csv_path*, rounds the score columns that are present
    to three decimals, sorts the rows by ``Speech IQ`` (highest first) and
    adds a ``Rank`` column holding a medal emoji for the first three rows
    and the 1-based position for every later row.

    Args:
        csv_path: Path of the results CSV. Defaults to ``SpeechIQ_table.csv``
            relative to the current working directory.

    Returns:
        A DataFrame with the columns ``Rank``, ``Speech IQ``, ``Remember``,
        ``Understand``, ``Apply``, ``Model Type``, ``Setup`` and
        ``Audio Encoder``, in that order. If anything goes wrong (unreadable
        file, missing column, ...) the error is printed and an empty
        DataFrame with the same columns is returned instead.
    """
    display_columns = [
        'Rank', 'Speech IQ', 'Remember', 'Understand', 'Apply',
        'Model Type', 'Setup', 'Audio Encoder',
    ]
    score_columns = ['Remember', 'Understand', 'Apply', 'Speech IQ']
    # Gold, silver and bronze medal emoji. Written as escapes so the values
    # survive a source-file encoding round-trip.
    medals = ("\U0001F947", "\U0001F948", "\U0001F949")

    try:
        df = pd.read_csv(csv_path)

        # Round numerical columns to 3 decimal places for better display
        for col in score_columns:
            if col in df.columns:
                df[col] = df[col].round(3)

        # Sort by Speech IQ score in descending order
        df = df.sort_values('Speech IQ', ascending=False)

        # Rank by position in the sorted table: medals first, then "4", "5", ...
        df['Rank'] = [
            medals[pos] if pos < len(medals) else str(pos + 1)
            for pos in range(len(df))
        ]

        # Put Rank and Speech IQ first; raises KeyError (handled below) when
        # the CSV lacks one of the expected columns.
        return df[display_columns]
    except Exception as e:
        # Best-effort boundary: the UI should still come up with an empty table.
        print(f"Error loading SpeechIQ data: {e}")
        return pd.DataFrame(columns=display_columns)
def _best_score_for_type(df, type_label):
    """Return the highest Speech IQ among rows whose Model Type contains *type_label*, or 0 if no row matches."""
    matches = df[df['Model Type'].str.contains(type_label, na=False)]
    return matches['Speech IQ'].max() if not matches.empty else 0


def get_top_performers(df):
    """Build a Markdown summary table of the leaderboard.

    Args:
        df: Leaderboard DataFrame with at least the ``Speech IQ``,
            ``Model Type`` and ``Setup`` columns.

    Returns:
        A Markdown string with the top setup, the highest score, the best
        score among rows whose ``Model Type`` contains ``Agentic`` and
        ``End2End`` respectively (0 when a group has no rows) and the total
        row count. Returns ``"No data available."`` when *df* is empty or
        has no non-missing ``Speech IQ`` value.
    """
    if df.empty:
        return "No data available."

    scores = df['Speech IQ']
    # With only missing scores there is no top row to report, and idxmax
    # cannot pick one.
    if scores.isna().all():
        return "No data available."

    top_score = scores.max()
    top_model = df.loc[scores.idxmax()]
    agentic_best = _best_score_for_type(df, 'Agentic')
    end2end_best = _best_score_for_type(df, 'End2End')

    stats_text = f"""
## 📊 Leaderboard Statistics
| Metric | Value |
|--------|-------|
| 🏆 **Top Performer** | {top_model['Setup']} |
| 🎯 **Highest Score** | **{top_score}** |
| 🤖 **Best Agentic Model** | {agentic_best} |
| 🔄 **Best End2End Model** | {end2end_best} |
| 📈 **Total Models** | {len(df)} |
"""
    return stats_text
# Load the data once at import time; the static parts of the UI below
# (analysis text, statistics table) are rendered from this snapshot.
speechiq_df = load_speechiq_data()

# Create the Gradio interface
demo = gr.Blocks(css=custom_css, title="SpeechIQ Leaderboard")
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 SpeechIQ Leaderboard", elem_id="speechiq-leaderboard-tab", id=0):
            # Main leaderboard table
            with gr.Row():
                leaderboard_table = gr.Dataframe(
                    value=speechiq_df,
                    # load_speechiq_data() returns the display columns even
                    # when it has no rows, so the headers always come from
                    # the frame itself and cannot drift from its columns.
                    headers=speechiq_df.columns.tolist(),
                    interactive=False,
                    elem_classes="leaderboard-table"
                )

            # Legend and explanation
            with gr.Row():
                gr.Markdown("""
### 📋 Column Explanations
- **Rank**: Position ranking with 🥇🥈🥉 medals for top 3 performers
- **Speech IQ**: Overall intelligence quotient combining all dimensions (primary metric)
- **Remember**: Verbatim accuracy score (WER-based)
- **Understand**: Semantic interpretation similarity score
- **Apply**: Downstream task performance score
- **Model Type**: Architecture approach (Agentic vs End2End)
- **Setup**: Specific model configuration and components
- **Audio Encoder**: The audio processing component used
*Higher scores indicate better performance across all metrics.*
""", elem_classes="markdown-text")

        with gr.TabItem("📊 Analysis", elem_id="analysis-tab", id=1):
            with gr.Row():
                # Create performance comparison charts
                if not speechiq_df.empty:
                    # Group by model type for comparison
                    agentic_models = speechiq_df[speechiq_df['Model Type'].str.contains('Agentic', na=False)]
                    end2end_models = speechiq_df[speechiq_df['Model Type'].str.contains('End2End', na=False)]

                    comparison_text = f"""
### 🔍 Model Type Comparison
**Agentic Models (ASR + LLM):**
- Count: {len(agentic_models)}
- Average Speech IQ: {agentic_models['Speech IQ'].mean():.2f}
- Best Score: {agentic_models['Speech IQ'].max():.2f}
**End-to-End Models:**
- Count: {len(end2end_models)}
- Average Speech IQ: {end2end_models['Speech IQ'].mean():.2f}
- Best Score: {end2end_models['Speech IQ'].max():.2f}
### 🎯 Cognitive Dimension Analysis
**Remember (Verbatim Accuracy):**
- Best performer: {speechiq_df.loc[speechiq_df['Remember'].idxmax(), 'Setup']} ({speechiq_df['Remember'].max():.3f})
**Understand (Semantic Similarity):**
- Best performer: {speechiq_df.loc[speechiq_df['Understand'].idxmax(), 'Setup']} ({speechiq_df['Understand'].max():.3f})
**Apply (Task Performance):**
- Best performer: {speechiq_df.loc[speechiq_df['Apply'].idxmax(), 'Setup']} ({speechiq_df['Apply'].max():.3f})
"""
                    gr.Markdown(comparison_text, elem_classes="markdown-text")
                else:
                    gr.Markdown("No data available for analysis.", elem_classes="markdown-text")

            # Statistics section - moved after table
            # NOTE(review): nesting reconstructed from an unindented source;
            # confirm this row belongs inside the Analysis tab.
            with gr.Row():
                gr.Markdown(get_top_performers(speechiq_df), elem_classes="markdown-text stats-section")

        with gr.TabItem("📝 About", elem_id="about-tab", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit", elem_id="submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

    # Citation section
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=6,
                elem_id="citation-button",
                show_copy_button=True,
            )

    # Add refresh functionality
    with gr.Row():
        refresh_button = gr.Button("🔄 Refresh Data", variant="secondary")

    def refresh_data():
        """Reload the results CSV and return the fresh leaderboard table."""
        updated_df = load_speechiq_data()
        return updated_df

    # Only the leaderboard table is an output of the refresh; the analysis
    # and statistics Markdown keep the values computed at start-up.
    refresh_button.click(
        refresh_data,
        outputs=leaderboard_table
    )

if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)