File size: 5,631 Bytes
0bbd367
08d1f1b
 
0bbd367
08d1f1b
b0ad3dc
 
08d1f1b
 
 
 
 
b0ad3dc
 
08d1f1b
 
b0ad3dc
 
08d1f1b
 
b0ad3dc
 
08d1f1b
 
b0ad3dc
 
08d1f1b
 
b0ad3dc
 
08d1f1b
 
b0ad3dc
 
 
08d1f1b
94c2f22
 
 
 
 
 
 
 
 
 
 
 
08d1f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bbd367
94c2f22
 
08d1f1b
b0ad3dc
08d1f1b
b0ad3dc
 
08d1f1b
0bbd367
94c2f22
08d1f1b
94c2f22
08d1f1b
 
0bbd367
b0ad3dc
08d1f1b
0bbd367
 
b0ad3dc
 
 
 
 
 
 
 
 
 
 
 
 
08d1f1b
b0ad3dc
0bbd367
b0ad3dc
 
 
 
 
 
 
 
 
 
 
 
08d1f1b
b0ad3dc
0bbd367
94c2f22
 
 
 
 
 
 
 
 
 
 
 
08d1f1b
94c2f22
 
08d1f1b
 
 
 
 
 
 
 
 
0bbd367
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np

# Data for Tabular Models (normalized to 0-10 from original 0-1 data)
# Structure: {domain_name: {model_name: score}}.
# Domain names feed the "Tabular Models" dropdown; model names become the
# y-axis labels in plot_comparison(). Models containing "Nexa" in their
# name are highlighted in indigo by the plotting code.
TABULAR_MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 7.1,
        "Porter6 (Secondary)": 8.5,
        "DeepCNF (Secondary)": 8.5,
        "AlphaFold2 (Tertiary GDT-TS)": 9.2,
        "Nexa Bio2 (Tertiary)": 9.0,
    },
    "Astro": {
        "Nexa Astro": 9.7,
        "Baseline CNN": 8.9,
    },
    "Materials": {
        "Nexa Materials": 10.0,
        "Random Forest Baseline": 9.2,
    },
    "QST": {
        "Nexa PIN Model": 8.0,
        "Quantum TomoNet": 8.5,
    },
    "HEP": {
        "Nexa HEP Model": 9.1,
        "CMSNet": 9.4,
    },
    "CFD": {
        "Nexa CFD Model": 9.2,
        "FlowNet": 8.9,
    },
}

# Data for Nexa Mistral Sci-7B Evaluation (from your image)
# Structure: {model: {metric: {rubric: score}}}. Each metric compares the
# general OSIR rubric against the physics-specialized OSIR-Field rubric
# on a 1-10 scale; metric names feed the "Nexa Mistral Sci-7B" dropdown.
NEXA_MISTRAL_EVALS = {
    "Nexa Mistral Sci-7B": {
        "Scientific Utility": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.5},
        "Symbolism & Math Logic": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.5},
        "Citation & Structure": {"OSIR (General)": 5.5, "OSIR-Field (Physics)": 6.0},
        "Thematic Grounding": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.0},
        "Hypothesis Framing": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.0},
        "Internal Consistency": {"OSIR (General)": 9.0, "OSIR-Field (Physics)": 9.5},
        "Entropy / Novelty": {"OSIR (General)": 6.5, "OSIR-Field (Physics)": 6.0},
    }
}

# Data for the "LLMs" tab: {domain: {model: score}}.
# This constant is read by plot_comparison() and the LLMs dropdown but was
# never defined, so the script crashed with NameError while building the UI.
# The "LLM (General OSIR)" key must exist — it is the dropdown's default value.
# NOTE(review): scores below are placeholders — replace with the real
# benchmark numbers before publishing.
LLM_MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 6.7,
        "Mistral-7B (Base)": 6.0,
    },
    "LLM (Field OSIR)": {
        "Nexa Mistral Sci-7B": 7.5,
        "Mistral-7B (Base)": 6.5,
    },
}

# Plotting function using Matplotlib
def plot_comparison(domain, data_type):
    """Build a horizontal bar chart for one benchmark view.

    Args:
        domain: For data_type "tabular"/"llm", a domain key of
            TABULAR_MODEL_EVALS / LLM_MODEL_EVALS; for "mistral", a metric
            name inside NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"].
        data_type: One of "tabular", "llm", or "mistral".

    Returns:
        The matplotlib Figure, suitable for a gradio Plot output.
    """
    if data_type == "mistral":
        metric = domain
        data = NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric]
        models = list(data.keys())
        scores = list(data.values())
        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
        y_pos = np.arange(len(models))
        width = 0.35
        # Draw one bar per rubric at its own y position. The previous code
        # passed the full y_pos array together with a single-element score
        # list, which matplotlib broadcast across BOTH y positions — drawing
        # four bars (two spurious) instead of two.
        ax.barh(y_pos[0], scores[0], width, label=models[0], color='yellow')
        ax.barh(y_pos[1], scores[1], width, label=models[1], color='orange')
    else:
        data = TABULAR_MODEL_EVALS[domain] if data_type == "tabular" else LLM_MODEL_EVALS[domain]
        models = list(data.keys())
        scores = list(data.values())
        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
        y_pos = np.arange(len(models))
        width = 0.8
        # Highlight Nexa models; baselines get a neutral gray.
        colors = ['indigo' if 'Nexa' in model else 'lightgray' if data_type == "tabular" else 'gray' for model in models]
        ax.barh(y_pos, scores, width, color=colors)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(models)
    ax.set_xlabel('Score (1-10)')
    ax.set_title(f"{('Nexa Mistral Sci-7B Evaluation: ' if data_type == 'mistral' else '')}{domain}")
    ax.set_xlim(0, 10)
    if data_type == "mistral":
        # Only the grouped mistral view needs a legend; the single-series
        # views label bars via the y-axis ticks.
        ax.legend()
    ax.grid(True, axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()

    return fig

# Display functions — thin adapters binding each Gradio tab to plot_comparison.
def display_tabular_eval(domain):
    """Render the tabular-model benchmark chart for *domain*."""
    figure = plot_comparison(domain, "tabular")
    return figure

def display_llm_eval(domain):
    """Render the LLM benchmark chart for *domain*."""
    figure = plot_comparison(domain, "llm")
    return figure

def display_mistral_eval(metric):
    """Render the Nexa Mistral Sci-7B chart for one evaluation *metric*."""
    figure = plot_comparison(metric, "mistral")
    return figure

# Gradio interface: one tab per benchmark family, each wiring a dropdown
# and a button to the matching display_* function.
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #e0e0e0; color: #333;}") as demo:
    gr.Markdown("""
    # 🔬 Nexa Evals — Scientific ML Benchmark Suite
    A benchmarking suite for Nexa models across various domains.
    """)

    with gr.Tabs():
        with gr.TabItem("Tabular Models"):
            with gr.Row():
                tabular_domain = gr.Dropdown(
                    choices=list(TABULAR_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="Proteins"
                )
                show_tabular_btn = gr.Button("Show Evaluation")
            tabular_plot = gr.Plot(label="Benchmark Plot")
            show_tabular_btn.click(
                fn=display_tabular_eval,
                inputs=tabular_domain,
                outputs=tabular_plot
            )

        with gr.TabItem("LLMs"):
            with gr.Row():
                llm_domain = gr.Dropdown(
                    choices=list(LLM_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="LLM (General OSIR)"
                )
                show_llm_btn = gr.Button("Show Evaluation")
            llm_plot = gr.Plot(label="Benchmark Plot")
            show_llm_btn.click(
                fn=display_llm_eval,
                inputs=llm_domain,
                outputs=llm_plot
            )

        with gr.TabItem("Nexa Mistral Sci-7B"):
            with gr.Row():
                mistral_metric = gr.Dropdown(
                    choices=list(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"].keys()),
                    label="Select Metric",
                    value="Scientific Utility"
                )
                show_mistral_btn = gr.Button("Show Evaluation")
            mistral_plot = gr.Plot(label="Benchmark Plot")
            show_mistral_btn.click(
                fn=display_mistral_eval,
                inputs=mistral_metric,
                outputs=mistral_plot
            )

        # Moved inside gr.Tabs() so "About" renders as a fourth tab alongside
        # the others; previously it sat at Blocks level and produced a
        # second, detached tab group below the main one.
        with gr.TabItem("About"):
            gr.Markdown("""
        # ℹ️ About Nexa Evals
        Nexa Evals benchmarks Nexa models across scientific domains:
        - **Tabular Models**: Compares Nexa models against baselines.
        - **LLMs**: Evaluates Nexa language models against competitors.
        - **Nexa Mistral Sci-7B**: Compares general and physics-specific performance.
        Scores are on a 1-10 scale.
        """)

demo.launch()