import gradio as gr
import plotly.graph_objects as go
import json

# Benchmark scores (0 to 1) for tabular models, keyed by scientific domain
TABULAR_MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
        "Porter6 (Secondary)": 0.8456,
        "DeepCNF (Secondary)": 0.85,
        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
        "Nexa Bio2 (Tertiary)": 0.90,
    },
    "Astro": {
        "Nexa Astro": 0.97,
        "Baseline CNN": 0.89,
    },
    "Materials": {
        "Nexa Materials": 0.9999,
        "Random Forest Baseline": 0.92,
    },
    "QST": {
        "Nexa PIN Model": 0.80,
        "Quantum TomoNet": 0.85,
    },
    "HEP": {
        "Nexa HEP Model": 0.91,
        "CMSNet": 0.94,
    },
    "CFD": {
        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
}

# Benchmark scores (0 to 1) for language models on the OSIR SciEval benchmark
LLM_MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 0.61,
        "Llama-3-8B-Instruct": 0.39,
        "Mixtral-8x7B-Instruct-v0.1": 0.41,
        "Claude-3-Sonnet": 0.64,
        "GPT-4-Turbo": 0.68,
        "GPT-4o": 0.71,
    },
    "LLM (Field-Specific OSIR)": {
        "Nexa Bio Adapter": 0.66,
        "Nexa Astro Adapter": 0.70,
        "GPT-4o (Biomed)": 0.69,
        "Claude-3-Opus (Bio)": 0.67,
        "Llama-3-8B-Bio": 0.42,
        "Mixtral-8x7B-BioTune": 0.43,
    },
}

# Universal plotting function for horizontal bar charts
def plot_horizontal_bar(domain, data, color):
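    """Render a horizontal bar chart of scores for one domain, highest first."""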
    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
    models, scores = zip(*sorted_items)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color=color,
    ))

    fig.update_layout(
        title=f"Model Benchmark Scores — {domain}",
        xaxis_title="Score",
        # Plotly places the first y category at the bottom, so reverse the axis
        # to keep the highest-scoring model on top, matching the sort order.
        yaxis=dict(title="Model", autorange="reversed"),
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        margin=dict(l=120, r=20, t=40, b=40),
    )
    return fig

# Display functions for each section
def display_tabular_eval(domain):
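    """Return a (plot, JSON-details) pair for a tabular-model domain."""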
    if domain not in TABULAR_MODEL_EVALS:
        return None, "Invalid domain selected"
    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], 'indigo')
    details = json.dumps(TABULAR_MODEL_EVALS[domain], indent=2)
    return plot, details

def display_llm_eval(domain):
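    """Return a (plot, JSON-details) pair for an LLM evaluation domain."""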
    if domain not in LLM_MODEL_EVALS:
        return None, "Invalid domain selected"
    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], 'lightblue')
    details = json.dumps(LLM_MODEL_EVALS[domain], indent=2)
    return plot, details
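
# Minimal standalone check of the display helpers (a debugging sketch, not part
# of the app flow; assumes a local environment where fig.show() can open a
# browser window):
#     fig, details = display_tabular_eval("Proteins")
#     fig.show()
#     print(details)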

# Gradio interface
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
    gr.Markdown("""
    # 🔬 Nexa Evals — Scientific ML Benchmark Suite
    A benchmarking suite that compares Nexa models against state-of-the-art baselines, covering both domain-specific tabular models and scientific language models.
    """)

    with gr.Tabs():
        with gr.TabItem("Tabular Models"):
            with gr.Row():
                tabular_domain = gr.Dropdown(
                    choices=list(TABULAR_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="Proteins"
                )
                show_tabular_btn = gr.Button("Show Evaluation")
            tabular_plot = gr.Plot(label="Benchmark Plot")
            tabular_details = gr.Code(label="Raw Scores (JSON)", language="json")
            show_tabular_btn.click(
                fn=display_tabular_eval,
                inputs=tabular_domain,
                outputs=[tabular_plot, tabular_details]
            )

        with gr.TabItem("LLMs"):
            with gr.Row():
                llm_domain = gr.Dropdown(
                    choices=list(LLM_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="LLM (General OSIR)"
                )
                show_llm_btn = gr.Button("Show Evaluation")
            llm_plot = gr.Plot(label="Benchmark Plot")
            llm_details = gr.Code(label="Raw Scores (JSON)", language="json")
            show_llm_btn.click(
                fn=display_llm_eval,
                inputs=llm_domain,
                outputs=[llm_plot, llm_details]
            )

    gr.Markdown("""
    ---
    ### ℹ️ About
    Nexa Evals provides benchmarks for both tabular models and language models in scientific domains:
    - **Tabular Models**: Evaluated on domain-specific metrics (e.g., accuracy, GDT-TS) across fields like Proteins, Astro, Materials, QST, HEP, and CFD.
    - **Language Models**: Assessed using the SciEval benchmark under the OSIR initiative, focusing on scientific utility, information entropy, internal consistency, hypothesis framing, domain grounding, and math logic.
    Scores range from 0 to 1, with higher values indicating better performance. Models are sorted by score in descending order for easy comparison.
    """)

if __name__ == "__main__":
    demo.launch()
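
# Note: demo.launch(share=True) would additionally create a temporary public
# link (a standard Gradio option), useful when testing from another machine.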