# NexaEvals / app.py
# Hugging Face Space by Allanatrix — commit 16d37fe (7.09 kB)
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
# ─── 1. BENCHMARK DATA ──────────────────────────────────────────────────────────
# Nested dict: Domain → {Model Name → {metric_name: value, ..., "SOTA_<metric>": value}}
# Convention: every baseline value is keyed with the "SOTA_" prefix so that
# plot_comparison() can separate model scores from SOTA references.
benchmark_data = {
    "Protein Folding": {
        "Nexa Bio1 (Secondary)": {
            "Accuracy (%)": 71,
            "Q3 (%)": 65,
            "Q8 (%)": 55,
            "TM-score": 0.60,
            "SOTA_Accuracy (%)": 85,
            "SOTA_TM-score": 0.75,
        },
        "Nexa Bio2 (Tertiary)": {
            "Confidence (%)": 90,
            "GDT_TS": 0.82,
            "Entropy Threshold (%)": 80,
            "SOTA_Confidence (%)": 92,
            "SOTA_GDT_TS": 0.85,
        },
    },
    "Astrophysics": {
        "Nexa Astro": {
            "Accuracy (%)": 97,
            "Macro-F1 (%)": 96,
            "ROC-AUC": 0.98,
            "SOTA_Accuracy (%)": 96,
            "SOTA_ROC-AUC": 0.97,
        },
    },
    "Materials Science": {
        "Nexa MatSci": {
            "MAE (eV)": 0.02,
            "RMSE (eV)": 0.03,
            "Bandgap Accuracy (%)": 98,
            "SOTA_MAE (eV)": 0.03,
            "SOTA_Bandgap Accuracy (%)": 95,
        },
    },
    "Quantum State Tomography": {
        "Nexa QST": {
            "Fidelity": 0.80,
            "Purity": 1.00,
            "Trace Distance": 0.15,
            "SOTA_Fidelity": 0.83,
            "SOTA_Trace Distance": 0.12,
        },
    },
    "Computational Fluid Dynamics": {
        "Nexa CFD": {
            "Relative L2 Error": 0.015,
            "Energy Conservation Loss": 0.005,
            "PSNR": 30,
            "SSIM": 0.88,
            "SOTA_Relative L2 Error": 0.020,
            "SOTA_SSIM": 0.85,
        },
    },
    "High-Energy Physics": {
        "Nexa HEP": {
            "ROC-AUC": 0.92,
            "Event Accuracy (%)": 90,
            "Jet Tagging (%)": 88,
            "SOTA_ROC-AUC": 0.93,
            "SOTA_Event Accuracy (%)": 89,
        },
    },
    "LLM Hypothesis & Methodology": {
        "Nexa MOE": {
            "Coherence (1–10)": 9.1,
            "Novelty (1–10)": 8.6,
            "Utility (1–10)": 8.8,
            # Renamed from "Expert-Rated SOTA (1–10)": without the "SOTA_"
            # prefix, plot_comparison() treated this baseline as a model metric.
            "SOTA_Expert Rating (1–10)": 9.0,
        },
    },
}
# ─── 2. SECTION DESCRIPTIONS ───────────────────────────────────────────────────
# Markdown blurbs rendered above the metrics table for each domain.
# Keys must match benchmark_data exactly — show_eval() indexes both with the
# same category string.  NOTE(review): some strings contain mis-encoded
# characters (e.g. "‐", "–") left verbatim because they are runtime text.
section_descriptions = {
"Protein Folding": """**Protein Folding**
Benchmarks for secondary (Q3/Q8) and tertiary (TM-score) structure prediction.
Nexa Bio1 handles sequence→secondary, Nexa Bio2 handles full 3D fold confidence.""",
"Astrophysics": """**Astrophysics**
Stellar classification and redshift estimation.
Metrics: Accuracy, F1, ROC-AUC against SDSS-Net and astroML baselines.""",
"Materials Science": """**Materials Science**
Property prediction for novel materials (e.g., bandgap, formation energy).
Metrics: MAE/RMSE, bandgap‐prediction accuracy vs. CGCNN, ALIGNN.""",
"Quantum State Tomography": """**Quantum State Tomography**
Reconstruct quantum states from measurement data.
Metrics: Fidelity, Purity, Trace Distance against PINNs and QuNet.""",
"Computational Fluid Dynamics": """**CFD**
Flow field prediction (Navier–Stokes).
Metrics: Relative L2 Error, PSNR/SSIM, Energy Conservation Loss vs. FNO.""",
"High-Energy Physics": """**High-Energy Physics**
Particle classification and signal/background separation.
Metrics: ROC-AUC, event reconstruction accuracy, jet-tagging efficiency.""",
"LLM Hypothesis & Methodology": """**LLM-Based Scientific Reasoning**
Hypothesis and methodology generation.
Metrics scored 1–10 by expert rubric on Coherence, Novelty, and Utility; compared to top academic LLM baselines."""
}
# ─── 3. PLOTTING FUNCTION ────────────────────────────────────────────────────────
def plot_comparison(category):
    """Bar chart of each model's metrics next to their SOTA reference values.

    Args:
        category: Key into ``benchmark_data`` (e.g. ``"Astrophysics"``).

    Returns:
        A matplotlib ``Figure`` with one group of bars per model: the model's
        own metrics first, then its ``SOTA_``-prefixed baselines (semi-
        transparent) to their right.
    """
    data = benchmark_data[category]
    fig, ax = plt.subplots(figsize=(7, 4))
    bar_width = 0.4
    labels = list(data.keys())
    tick_positions = []  # x center of each model's bar group

    for i, model in enumerate(labels):
        metrics = data[model]
        # Split the model's own scores from the "SOTA_"-prefixed baselines.
        non_sota = {k: v for k, v in metrics.items() if not k.startswith("SOTA")}
        sota = {k.replace("SOTA_", "", 1): v
                for k, v in metrics.items() if k.startswith("SOTA")}
        pos = i * 2  # leave a gap of ~2 units between model groups
        n_own = len(non_sota)
        ax.bar([pos + j * bar_width for j in range(n_own)],
               list(non_sota.values()),
               width=bar_width, label=f"{model} Metrics")
        if sota:
            ax.bar([pos + (n_own + j) * bar_width for j in range(len(sota))],
                   list(sota.values()),
                   width=bar_width, alpha=0.7, label=f"{model} SOTA")
        # BUGFIX: ticks were computed after the loop using the *last* model's
        # metric count, misplacing ticks whenever models differ in metric
        # count (e.g. Protein Folding).  Center each tick per model instead.
        n_total = n_own + len(sota)
        tick_positions.append(pos + bar_width * (n_total - 1) / 2)

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.set_ylabel("Value / Score")
    ax.set_title(f"{category} — Nexa vs. SOTA")  # fixed mojibake em dash
    ax.legend(loc="upper right")
    plt.tight_layout()
    return fig
# ─── 4. CALLBACK TO RENDER SECTION ─────────────────────────────────────────────
def show_eval(category):
    """Return the (markdown description, metrics table, comparison figure)
    triple for the chosen domain, matching the three Gradio output widgets."""
    # Transpose so models are rows and metric names are columns.
    metrics_table = pd.DataFrame(benchmark_data[category]).T
    return section_descriptions[category], metrics_table, plot_comparison(category)
# ─── 5. BUILD GRADIO APP ───────────────────────────────────────────────────────
with gr.Blocks(css="""
body { background-color: #f7f9fc; font-family: Arial, sans-serif; }
.gradio-container { max-width: 900px; margin: auto; }
h1, h2, h3 { color: #333; }
""") as app:
    gr.Markdown("# 🔬 Nexa Evals Dashboard")  # fixed mojibake emoji
    gr.Markdown(
        "A **comprehensive** SciML benchmark framework. Select a domain to "
        "view metrics, compare with SOTA, and explore detailed plots and tables."
    )

    with gr.Row():
        with gr.Column(scale=1):
            category = gr.Radio(
                choices=list(benchmark_data.keys()),
                value="Protein Folding",
                label="Select Domain / Model Group",
            )
        with gr.Column(scale=3):
            description = gr.Markdown("")
            # No fixed headers: show_eval() returns a transposed DataFrame
            # (models as rows, metric names as columns), so the original
            # ["Metric", "Value"] headers never matched the data.
            table = gr.Dataframe(interactive=False)
            plot = gr.Plot()

    category.change(
        fn=show_eval,
        inputs=category,
        outputs=[description, table, plot],
    )
    # BUGFIX: the original initialized by assigning description.value and
    # table.value directly and *discarded* the figure, leaving the plot blank
    # until the user changed the selection.  Wire initial render through the
    # load event so all three outputs are populated on page load.
    app.load(fn=show_eval, inputs=category, outputs=[description, table, plot])

# Launch (on Hugging Face the config flags will be auto-managed)
app.launch()