Allanatrix committed
Commit 08d1f1b · verified · 1 Parent(s): fb054fc

Update app.py

Files changed (1)
  1. app.py +69 -123
app.py CHANGED
@@ -1,59 +1,39 @@
 import gradio as gr
-import plotly.graph_objects as go
-import json
 
-# Data for tabular models
 TABULAR_MODEL_EVALS = {
     "Proteins": {
-        "Nexa Bio1 (Secondary)": 0.71,
-        "Porter6 (Secondary)": 0.8456,
-        "DeepCNF (Secondary)": 0.85,
-        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
-        "Nexa Bio2 (Tertiary)": 0.90,
     },
     "Astro": {
-        "Nexa Astro": 0.97,
-        "Baseline CNN": 0.89,
     },
     "Materials": {
-        "Nexa Materials": 0.9999,
-        "Random Forest Baseline": 0.92,
     },
     "QST": {
-        "Nexa PIN Model": 0.80,
-        "Quantum TomoNet": 0.85,
     },
     "HEP": {
-        "Nexa HEP Model": 0.91,
-        "CMSNet": 0.94,
     },
     "CFD": {
-        "Nexa CFD Model": 0.92,
-        "FlowNet": 0.89,
     },
 }
 
-# Data for LLMs
-LLM_MODEL_EVALS = {
-    "LLM (General OSIR)": {
-        "Nexa Mistral Sci-7B": 0.61,
-        "Llama-3-8B-Instruct": 0.39,
-        "Mixtral-8x7B-Instruct-v0.1": 0.41,
-        "Claude-3-Sonnet": 0.64,
-        "GPT-4-Turbo": 0.68,
-        "GPT-4o": 0.71,
-    },
-    "LLM (Field-Specific OSIR)": {
-        "Nexa Bio Adapter": 0.66,
-        "Nexa Astro Adapter": 0.70,
-        "GPT-4o (Biomed)": 0.69,
-        "Claude-3-Opus (Bio)": 0.67,
-        "Llama-3-8B-Bio": 0.42,
-        "Mixtral-8x7B-BioTune": 0.43,
-    },
-}
-
-# Data for Nexa Mistral Sci-7B Evaluation (based on the provided image)
 NEXA_MISTRAL_EVALS = {
     "Nexa Mistral Sci-7B": {
         "Scientific Utility": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.5},
@@ -66,86 +46,55 @@ NEXA_MISTRAL_EVALS = {
     }
 }
 
-# Universal plotting function with highlighted Nexa models
-def plot_horizontal_bar(domain, data, highlight_keyword="Nexa", highlight_color='indigo', default_color='lightgray'):
-    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
-    models, scores = zip(*sorted_items)
-    colors = [highlight_color if highlight_keyword in model else default_color for model in models]
-
-    fig = go.Figure()
-    fig.add_trace(go.Bar(
-        x=scores,
-        y=models,
-        orientation='h',
-        marker_color=colors,
-    ))
-
-    fig.update_layout(
-        title=f"Model Benchmark Scores — {domain}",
-        xaxis_title="Score",
-        yaxis_title="Model",
-        xaxis_range=[0, 1.0],
-        template="plotly_white",
-        height=500,
-        margin=dict(l=120, r=20, t=40, b=40),
-        yaxis=dict(automargin=True),
-    )
     return fig
 
-# Plotting function for Nexa Mistral Sci-7B Evaluation
-def plot_mistral_eval(metric):
-    if metric not in NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"]:
-        return None, "Invalid metric selected"
-    data = NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric]
-    models = list(data.keys())
-    scores = list(data.values())
-
-    fig = go.Figure()
-    fig.add_trace(go.Bar(
-        x=scores,
-        y=models,
-        orientation='h',
-        marker_color=['yellow', 'orange'] # Matching the provided image colors
-    ))
-
-    fig.update_layout(
-        title=f"Nexa Mistral Sci-7B Evaluation: {metric}",
-        xaxis_title="Score (1-10)",
-        yaxis_title="Model",
-        xaxis_range=[0, 10],
-        template="plotly_black",
-        height=400,
-        margin=dict(l=120, r=20, t=40, b=40),
-        yaxis=dict(automargin=True),
-        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
-    )
-    return fig
-
-# Display functions for each section
 def display_tabular_eval(domain):
-    if domain not in TABULAR_MODEL_EVALS:
-        return None, "Invalid domain selected"
-    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], highlight_color='indigo', default_color='lightgray')
-    details = json.dumps(TABULAR_MODEL_EVALS[domain], indent=2)
-    return plot, details
 
 def display_llm_eval(domain):
-    if domain not in LLM_MODEL_EVALS:
-        return None, "Invalid domain selected"
-    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], highlight_color='lightblue', default_color='gray')
-    details = json.dumps(LLM_MODEL_EVALS[domain], indent=2)
-    return plot, details
 
 def display_mistral_eval(metric):
-    plot = plot_mistral_eval(metric)
-    details = json.dumps(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric], indent=2)
-    return plot, details
 
-# Gradio interface with improved styling
-with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #f0f0f0; color: #333;}") as demo:
     gr.Markdown("""
     # 🔬 Nexa Evals — Scientific ML Benchmark Suite
-    A comprehensive benchmarking suite comparing Nexa models against state-of-the-art models.
     """)
 
     with gr.Tabs():
@@ -158,11 +107,10 @@ with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #f
             )
             show_tabular_btn = gr.Button("Show Evaluation")
             tabular_plot = gr.Plot(label="Benchmark Plot")
-            tabular_details = gr.Code(label="Raw Scores (JSON)", language="json")
             show_tabular_btn.click(
                 fn=display_tabular_eval,
                 inputs=tabular_domain,
-                outputs=[tabular_plot, tabular_details]
             )
 
         with gr.TabItem("LLMs"):
@@ -174,11 +122,10 @@ with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #f
             )
             show_llm_btn = gr.Button("Show Evaluation")
             llm_plot = gr.Plot(label="Benchmark Plot")
-            llm_details = gr.Code(label="Raw Scores (JSON)", language="json")
             show_llm_btn.click(
                 fn=display_llm_eval,
                 inputs=llm_domain,
-                outputs=[llm_plot, llm_details]
             )
 
         with gr.TabItem("Nexa Mistral Sci-7B"):
@@ -190,21 +137,20 @@ with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #f
             )
             show_mistral_btn = gr.Button("Show Evaluation")
             mistral_plot = gr.Plot(label="Benchmark Plot")
-            mistral_details = gr.Code(label="Raw Scores (JSON)", language="json")
             show_mistral_btn.click(
                 fn=display_mistral_eval,
                 inputs=mistral_metric,
-                outputs=[mistral_plot, mistral_details]
             )
 
-    gr.Markdown("""
-    ---
-    ### ℹ️ About
-    Nexa Evals provides benchmarks for tabular models, language models, and specific evaluations like Nexa Mistral Sci-7B:
-    - **Tabular Models**: Evaluated on domain-specific metrics across fields like Proteins and Astro.
-    - **LLMs**: Assessed using the SciEval benchmark under the OSIR initiative.
-    - **Nexa Mistral Sci-7B**: Compares general (OSIR) and physics-specific (OSIR-Field) performance across multiple metrics.
-    Scores are normalized where applicable (0-1 for tabular/LLMs, 1-10 for Mistral).
-    """)
 
 demo.launch()
 
 import gradio as gr
+import matplotlib.pyplot as plt
+import numpy as np
 
+# Data for Tabular Models (normalized to 0-10 from original 0-1 data)
 TABULAR_MODEL_EVALS = {
     "Proteins": {
+        "Nexa Bio1 (Secondary)": 7.1,
+        "Porter6 (Secondary)": 8.5,
+        "DeepCNF (Secondary)": 8.5,
+        "AlphaFold2 (Tertiary GDT-TS)": 9.2,
+        "Nexa Bio2 (Tertiary)": 9.0,
     },
     "Astro": {
+        "Nexa Astro": 9.7,
+        "Baseline CNN": 8.9,
     },
     "Materials": {
+        "Nexa Materials": 10.0,
+        "Random Forest Baseline": 9.2,
     },
     "QST": {
+        "Nexa PIN Model": 8.0,
+        "Quantum TomoNet": 8.5,
     },
     "HEP": {
+        "Nexa HEP Model": 9.1,
+        "CMSNet": 9.4,
     },
     "CFD": {
+        "Nexa CFD Model": 9.2,
+        "FlowNet": 8.9,
     },
 }
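 
+# Data for LLMs. Restored here as an assumed fix, not part of the original
+# commit: plot_comparison and the LLMs tab below still reference
+# LLM_MODEL_EVALS, which this commit otherwise deletes, so the LLMs tab would
+# raise a NameError. Values are the removed 0-1 scores rescaled to the 0-10
+# scale used elsewhere in this version.
+LLM_MODEL_EVALS = {
+    "LLM (General OSIR)": {
+        "Nexa Mistral Sci-7B": 6.1,
+        "Llama-3-8B-Instruct": 3.9,
+        "Mixtral-8x7B-Instruct-v0.1": 4.1,
+        "Claude-3-Sonnet": 6.4,
+        "GPT-4-Turbo": 6.8,
+        "GPT-4o": 7.1,
+    },
+    "LLM (Field-Specific OSIR)": {
+        "Nexa Bio Adapter": 6.6,
+        "Nexa Astro Adapter": 7.0,
+        "GPT-4o (Biomed)": 6.9,
+        "Claude-3-Opus (Bio)": 6.7,
+        "Llama-3-8B-Bio": 4.2,
+        "Mixtral-8x7B-BioTune": 4.3,
+    },
+}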
 
+# Data for Nexa Mistral Sci-7B Evaluation (from your image)
 NEXA_MISTRAL_EVALS = {
     "Nexa Mistral Sci-7B": {
         "Scientific Utility": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.5},
 
     }
 }
 
+# Plotting function using Matplotlib
+def plot_comparison(domain, data_type):
+    if data_type == "mistral":
+        metric = domain
+        data = NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric]
+        models = list(data.keys())
+        scores = list(data.values())
+        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
+        y_pos = np.arange(len(models))
+        width = 0.35
+        # One bar per model (fixes the grouped-bar offsets, which drew a
+        # duplicate bar at each y position)
+        ax.barh(y_pos[:1], scores[:1], width, label=models[0], color='yellow')
+        ax.barh(y_pos[1:], scores[1:], width, label=models[1], color='orange')
+    else:
+        data = TABULAR_MODEL_EVALS[domain] if data_type == "tabular" else LLM_MODEL_EVALS[domain]
+        models = list(data.keys())
+        scores = list(data.values())
+        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
+        y_pos = np.arange(len(models))
+        width = 0.8
+        colors = ['indigo' if 'Nexa' in model else 'lightgray' if data_type == "tabular" else 'gray' for model in models]
+        ax.barh(y_pos, scores, width, color=colors)
+
+    ax.set_yticks(y_pos)
+    ax.set_yticklabels(models)
+    ax.set_xlabel('Score (1-10)')
+    ax.set_title(f"{('Nexa Mistral Sci-7B Evaluation: ' if data_type == 'mistral' else '')}{domain}")
+    ax.set_xlim(0, 10)
+    if data_type == "mistral":
+        ax.legend()
+    ax.grid(True, axis='x', linestyle='--', alpha=0.7)
+    plt.tight_layout()
     return fig
 
+# Display functions
 def display_tabular_eval(domain):
+    return plot_comparison(domain, "tabular")
 
 def display_llm_eval(domain):
+    return plot_comparison(domain, "llm")
 
 def display_mistral_eval(metric):
+    return plot_comparison(metric, "mistral")
 
+# Gradio interface
+with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #e0e0e0; color: #333;}") as demo:
     gr.Markdown("""
     # 🔬 Nexa Evals — Scientific ML Benchmark Suite
+    A benchmarking suite for Nexa models across various domains.
     """)
 
     with gr.Tabs():
 
             )
             show_tabular_btn = gr.Button("Show Evaluation")
             tabular_plot = gr.Plot(label="Benchmark Plot")
             show_tabular_btn.click(
                 fn=display_tabular_eval,
                 inputs=tabular_domain,
+                outputs=tabular_plot
             )
 
         with gr.TabItem("LLMs"):
 
             )
             show_llm_btn = gr.Button("Show Evaluation")
             llm_plot = gr.Plot(label="Benchmark Plot")
             show_llm_btn.click(
                 fn=display_llm_eval,
                 inputs=llm_domain,
+                outputs=llm_plot
             )
 
         with gr.TabItem("Nexa Mistral Sci-7B"):
 
             )
             show_mistral_btn = gr.Button("Show Evaluation")
             mistral_plot = gr.Plot(label="Benchmark Plot")
             show_mistral_btn.click(
                 fn=display_mistral_eval,
                 inputs=mistral_metric,
+                outputs=mistral_plot
             )
 
+        with gr.TabItem("About"):
+            gr.Markdown("""
+            # ℹ️ About Nexa Evals
+            Nexa Evals benchmarks Nexa models across scientific domains:
+            - **Tabular Models**: Compares Nexa models against baselines.
+            - **LLMs**: Evaluates Nexa language models against competitors.
+            - **Nexa Mistral Sci-7B**: Compares general and physics-specific performance.
+            Scores are on a 1-10 scale.
+            """)
 
 demo.launch()