# NexaEvals / app.py
# Hugging Face Space by Allanatrix — commit 16d37fe (7.09 kB)
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
# ─── 1. BENCHMARK DATA ──────────────────────────────────────────────────────────
# Nested dict: Domain → {Model Name → {metric_name: value, ..., "SOTA_<metric>": value}}
# Convention: every baseline value is keyed with the "SOTA_" prefix so that
# plot_comparison() can separate model scores from SOTA references.
benchmark_data = {
    "Protein Folding": {
        "Nexa Bio1 (Secondary)": {
            "Accuracy (%)": 71,
            "Q3 (%)": 65,
            "Q8 (%)": 55,
            "TM-score": 0.60,
            "SOTA_Accuracy (%)": 85,
            "SOTA_TM-score": 0.75,
        },
        "Nexa Bio2 (Tertiary)": {
            "Confidence (%)": 90,
            "GDT_TS": 0.82,
            "Entropy Threshold (%)": 80,
            "SOTA_Confidence (%)": 92,
            "SOTA_GDT_TS": 0.85,
        },
    },
    "Astrophysics": {
        "Nexa Astro": {
            "Accuracy (%)": 97,
            "Macro-F1 (%)": 96,
            "ROC-AUC": 0.98,
            "SOTA_Accuracy (%)": 96,
            "SOTA_ROC-AUC": 0.97,
        },
    },
    "Materials Science": {
        "Nexa MatSci": {
            "MAE (eV)": 0.02,
            "RMSE (eV)": 0.03,
            "Bandgap Accuracy (%)": 98,
            "SOTA_MAE (eV)": 0.03,
            "SOTA_Bandgap Accuracy (%)": 95,
        },
    },
    "Quantum State Tomography": {
        "Nexa QST": {
            "Fidelity": 0.80,
            "Purity": 1.00,
            "Trace Distance": 0.15,
            "SOTA_Fidelity": 0.83,
            "SOTA_Trace Distance": 0.12,
        },
    },
    "Computational Fluid Dynamics": {
        "Nexa CFD": {
            "Relative L2 Error": 0.015,
            "Energy Conservation Loss": 0.005,
            "PSNR": 30,
            "SSIM": 0.88,
            "SOTA_Relative L2 Error": 0.020,
            "SOTA_SSIM": 0.85,
        },
    },
    "High-Energy Physics": {
        "Nexa HEP": {
            "ROC-AUC": 0.92,
            "Event Accuracy (%)": 90,
            "Jet Tagging (%)": 88,
            "SOTA_ROC-AUC": 0.93,
            "SOTA_Event Accuracy (%)": 89,
        },
    },
    "LLM Hypothesis & Methodology": {
        "Nexa MOE": {
            "Coherence (1–10)": 9.1,
            "Novelty (1–10)": 8.6,
            "Utility (1–10)": 8.8,
            # Renamed from "Expert-Rated SOTA (1–10)": without the "SOTA_"
            # prefix, plot_comparison() treated this baseline as a model metric.
            "SOTA_Expert Rating (1–10)": 9.0,
        },
    },
}
# ─── 2. SECTION DESCRIPTIONS ───────────────────────────────────────────────────
# Markdown blurbs rendered above the metrics table for each domain.
# Keys must match benchmark_data exactly — show_eval() indexes both with the
# same category string.  NOTE(review): some strings contain mis-encoded
# characters (e.g. "‐", "–") left verbatim because they are runtime text.
section_descriptions = {
"Protein Folding": """**Protein Folding**
Benchmarks for secondary (Q3/Q8) and tertiary (TM-score) structure prediction.
Nexa Bio1 handles sequence→secondary, Nexa Bio2 handles full 3D fold confidence.""",
"Astrophysics": """**Astrophysics**
Stellar classification and redshift estimation.
Metrics: Accuracy, F1, ROC-AUC against SDSS-Net and astroML baselines.""",
"Materials Science": """**Materials Science**
Property prediction for novel materials (e.g., bandgap, formation energy).
Metrics: MAE/RMSE, bandgap‐prediction accuracy vs. CGCNN, ALIGNN.""",
"Quantum State Tomography": """**Quantum State Tomography**
Reconstruct quantum states from measurement data.
Metrics: Fidelity, Purity, Trace Distance against PINNs and QuNet.""",
"Computational Fluid Dynamics": """**CFD**
Flow field prediction (Navier–Stokes).
Metrics: Relative L2 Error, PSNR/SSIM, Energy Conservation Loss vs. FNO.""",
"High-Energy Physics": """**High-Energy Physics**
Particle classification and signal/background separation.
Metrics: ROC-AUC, event reconstruction accuracy, jet-tagging efficiency.""",
"LLM Hypothesis & Methodology": """**LLM-Based Scientific Reasoning**
Hypothesis and methodology generation.
Metrics scored 1–10 by expert rubric on Coherence, Novelty, and Utility; compared to top academic LLM baselines."""
}
# ─── 3. PLOTTING FUNCTION ────────────────────────────────────────────────────────
def plot_comparison(category):
    """Bar chart of each model's metrics next to their SOTA reference values.

    Args:
        category: Key into ``benchmark_data`` (e.g. ``"Astrophysics"``).

    Returns:
        A matplotlib ``Figure`` with one group of bars per model: the model's
        own metrics first, then its ``SOTA_``-prefixed baselines (semi-
        transparent) to their right.
    """
    data = benchmark_data[category]
    fig, ax = plt.subplots(figsize=(7, 4))
    bar_width = 0.4
    labels = list(data.keys())
    tick_positions = []  # x center of each model's bar group

    for i, model in enumerate(labels):
        metrics = data[model]
        # Split the model's own scores from the "SOTA_"-prefixed baselines.
        non_sota = {k: v for k, v in metrics.items() if not k.startswith("SOTA")}
        sota = {k.replace("SOTA_", "", 1): v
                for k, v in metrics.items() if k.startswith("SOTA")}
        pos = i * 2  # leave a gap of ~2 units between model groups
        n_own = len(non_sota)
        ax.bar([pos + j * bar_width for j in range(n_own)],
               list(non_sota.values()),
               width=bar_width, label=f"{model} Metrics")
        if sota:
            ax.bar([pos + (n_own + j) * bar_width for j in range(len(sota))],
                   list(sota.values()),
                   width=bar_width, alpha=0.7, label=f"{model} SOTA")
        # BUGFIX: ticks were computed after the loop using the *last* model's
        # metric count, misplacing ticks whenever models differ in metric
        # count (e.g. Protein Folding).  Center each tick per model instead.
        n_total = n_own + len(sota)
        tick_positions.append(pos + bar_width * (n_total - 1) / 2)

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.set_ylabel("Value / Score")
    ax.set_title(f"{category} — Nexa vs. SOTA")  # fixed mojibake em dash
    ax.legend(loc="upper right")
    plt.tight_layout()
    return fig
# ─── 4. CALLBACK TO RENDER SECTION ─────────────────────────────────────────────
def show_eval(category):
    """Return the (markdown description, metrics table, comparison figure)
    triple for the chosen domain, matching the three Gradio output widgets."""
    # Transpose so models are rows and metric names are columns.
    metrics_table = pd.DataFrame(benchmark_data[category]).T
    return section_descriptions[category], metrics_table, plot_comparison(category)
# ─── 5. BUILD GRADIO APP ───────────────────────────────────────────────────────
with gr.Blocks(css="""
body { background-color: #f7f9fc; font-family: Arial, sans-serif; }
.gradio-container { max-width: 900px; margin: auto; }
h1, h2, h3 { color: #333; }
""") as app:
    gr.Markdown("# 🔬 Nexa Evals Dashboard")  # fixed mojibake emoji
    gr.Markdown(
        "A **comprehensive** SciML benchmark framework. Select a domain to "
        "view metrics, compare with SOTA, and explore detailed plots and tables."
    )

    with gr.Row():
        with gr.Column(scale=1):
            category = gr.Radio(
                choices=list(benchmark_data.keys()),
                value="Protein Folding",
                label="Select Domain / Model Group",
            )
        with gr.Column(scale=3):
            description = gr.Markdown("")
            # No fixed headers: show_eval() returns a transposed DataFrame
            # (models as rows, metric names as columns), so the original
            # ["Metric", "Value"] headers never matched the data.
            table = gr.Dataframe(interactive=False)
            plot = gr.Plot()

    category.change(
        fn=show_eval,
        inputs=category,
        outputs=[description, table, plot],
    )
    # BUGFIX: the original initialized by assigning description.value and
    # table.value directly and *discarded* the figure, leaving the plot blank
    # until the user changed the selection.  Wire initial render through the
    # load event so all three outputs are populated on page load.
    app.load(fn=show_eval, inputs=category, outputs=[description, table, plot])

# Launch (on Hugging Face the config flags will be auto-managed)
app.launch()