import gradio as gr
import plotly.graph_objects as go
import json
# Data for tabular models
TABULAR_MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
        "Porter6 (Secondary)": 0.8456,
        "DeepCNF (Secondary)": 0.85,
        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
        "Nexa Bio2 (Tertiary)": 0.90,
    },
    "Astro": {
        "Nexa Astro": 0.97,
        "Baseline CNN": 0.89,
    },
    "Materials": {
        "Nexa Materials": 0.9999,
        "Random Forest Baseline": 0.92,
    },
    "QST": {
        "Nexa PIN Model": 0.80,
        "Quantum TomoNet": 0.85,
    },
    "HEP": {
        "Nexa HEP Model": 0.91,
        "CMSNet": 0.94,
    },
    "CFD": {
        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
}
# Data for LLMs
LLM_MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 0.61,
        "Llama-3-8B-Instruct": 0.39,
        "Mixtral-8x7B-Instruct-v0.1": 0.41,
        "Claude-3-Sonnet": 0.64,
        "GPT-4-Turbo": 0.68,
        "GPT-4o": 0.71,
    },
    "LLM (Field-Specific OSIR)": {
        "Nexa Bio Adapter": 0.66,
        "Nexa Astro Adapter": 0.70,
        "GPT-4o (Biomed)": 0.69,
        "Claude-3-Opus (Bio)": 0.67,
        "Llama-3-8B-Bio": 0.42,
        "Mixtral-8x7B-BioTune": 0.43,
    },
}
# Universal plotting function for horizontal bar charts
def plot_horizontal_bar(domain, data, color):
    """Build a horizontal bar chart of benchmark scores for a single domain."""
    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
    models, scores = zip(*sorted_items)
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color=color,
    ))
    fig.update_layout(
        title=f"Model Benchmark Scores — {domain}",
        xaxis_title="Score",
        yaxis_title="Model",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        margin=dict(l=120, r=20, t=40, b=40),
    )
    return fig
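# Illustrative usage (not part of the app flow): the helper returns a plain
# Plotly figure, so it can also be inspected outside Gradio, e.g.
#   fig = plot_horizontal_bar("Astro", TABULAR_MODEL_EVALS["Astro"], "indigo")
#   fig.show()  # opens the chart in a browser when run as a standalone script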
# Display functions for each section
def display_tabular_eval(domain):
    """Return the benchmark plot and raw JSON scores for a tabular-model domain."""
    if domain not in TABULAR_MODEL_EVALS:
        return None, "Invalid domain selected"
    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], 'indigo')
    details = json.dumps(TABULAR_MODEL_EVALS[domain], indent=2)
    return plot, details

def display_llm_eval(domain):
    """Return the benchmark plot and raw JSON scores for an LLM evaluation track."""
    if domain not in LLM_MODEL_EVALS:
        return None, "Invalid domain selected"
    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], 'lightblue')
    details = json.dumps(LLM_MODEL_EVALS[domain], indent=2)
    return plot, details
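# Illustrative smoke test (assumed usage, not wired into the UI): each display
# function returns the (figure, JSON string) pair expected by the two outputs
# registered in the click handlers below, e.g.
#   fig, details = display_tabular_eval("Proteins")  # (go.Figure, JSON string)
#   none, msg = display_tabular_eval("Unknown")      # (None, "Invalid domain selected")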
# Gradio interface
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
    gr.Markdown("""
# 🔬 Nexa Evals — Scientific ML Benchmark Suite
A benchmarking suite comparing Nexa models with state-of-the-art counterparts across scientific domains, covering both tabular models and language models.
""")
    with gr.Tabs():
        with gr.TabItem("Tabular Models"):
            with gr.Row():
                tabular_domain = gr.Dropdown(
                    choices=list(TABULAR_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="Proteins"
                )
                show_tabular_btn = gr.Button("Show Evaluation")
            tabular_plot = gr.Plot(label="Benchmark Plot")
            tabular_details = gr.Code(label="Raw Scores (JSON)", language="json")
            show_tabular_btn.click(
                fn=display_tabular_eval,
                inputs=tabular_domain,
                outputs=[tabular_plot, tabular_details]
            )
        with gr.TabItem("LLMs"):
            with gr.Row():
                llm_domain = gr.Dropdown(
                    choices=list(LLM_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="LLM (General OSIR)"
                )
                show_llm_btn = gr.Button("Show Evaluation")
            llm_plot = gr.Plot(label="Benchmark Plot")
            llm_details = gr.Code(label="Raw Scores (JSON)", language="json")
            show_llm_btn.click(
                fn=display_llm_eval,
                inputs=llm_domain,
                outputs=[llm_plot, llm_details]
            )
    gr.Markdown("""
---
### ℹ️ About
Nexa Evals provides benchmarks for both tabular models and language models in scientific domains:
- **Tabular Models**: Evaluated on domain-specific metrics (e.g., accuracy, GDT-TS) across fields such as Proteins, Astro, Materials, QST, HEP, and CFD.
- **Language Models**: Assessed using the SciEval benchmark under the OSIR initiative, focusing on scientific utility, information entropy, internal consistency, hypothesis framing, domain grounding, and math logic.

Scores range from 0 to 1, with higher values indicating better performance. Models are sorted by score in descending order for easy comparison.
""")
demo.launch()