import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
# Data for Tabular Models (normalized to 0-10 from original 0-1 data)
TABULAR_MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 7.1,
        "Porter6 (Secondary)": 8.5,
        "DeepCNF (Secondary)": 8.5,
        "AlphaFold2 (Tertiary GDT-TS)": 9.2,
        "Nexa Bio2 (Tertiary)": 9.0,
    },
    "Astro": {
        "Nexa Astro": 9.7,
        "Baseline CNN": 8.9,
    },
    "Materials": {
        "Nexa Materials": 10.0,
        "Random Forest Baseline": 9.2,
    },
    "QST": {
        "Nexa PIN Model": 8.0,
        "Quantum TomoNet": 8.5,
    },
    "HEP": {
        "Nexa HEP Model": 9.1,
        "CMSNet": 9.4,
    },
    "CFD": {
        "Nexa CFD Model": 9.2,
        "FlowNet": 8.9,
    },
}
# Data for Nexa Mistral Sci-7B evaluation (per-metric OSIR benchmark scores)
NEXA_MISTRAL_EVALS = {
    "Nexa Mistral Sci-7B": {
        "Scientific Utility": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.5},
        "Symbolism & Math Logic": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.5},
        "Citation & Structure": {"OSIR (General)": 5.5, "OSIR-Field (Physics)": 6.0},
        "Thematic Grounding": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.0},
        "Hypothesis Framing": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.0},
        "Internal Consistency": {"OSIR (General)": 9.0, "OSIR-Field (Physics)": 9.5},
        "Entropy / Novelty": {"OSIR (General)": 6.5, "OSIR-Field (Physics)": 6.0},
    }
}
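# Placeholder data for the "LLMs" tab below (structure: domain -> {model: score},
# mirroring TABULAR_MODEL_EVALS). These are NOT published benchmark results:
# the scores are simply the means of the per-metric OSIR scores above, used so
# the tab renders end to end. Replace with real evaluation data.
LLM_MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 6.7,  # mean of the seven OSIR (General) scores
    },
    "LLM (Physics OSIR-Field)": {
        "Nexa Mistral Sci-7B": 7.5,  # mean of the seven OSIR-Field (Physics) scores
    },
}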
# Plotting function using Matplotlib
def plot_comparison(domain, data_type):
    if data_type == "mistral":
        # For the Mistral tab, `domain` is a metric name; the two bars compare
        # the OSIR (General) and OSIR-Field (Physics) scores for that metric.
        metric = domain
        data = NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric]
        models = list(data.keys())
        scores = list(data.values())
        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
        y_pos = np.arange(len(models))
        width = 0.35
        # Draw one bar per benchmark, each with its own legend entry.
        ax.barh(y_pos[0], scores[0], width, label=models[0], color='yellow')
        ax.barh(y_pos[1], scores[1], width, label=models[1], color='orange')
    else:
        data = TABULAR_MODEL_EVALS[domain] if data_type == "tabular" else LLM_MODEL_EVALS[domain]
        models = list(data.keys())
        scores = list(data.values())
        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
        y_pos = np.arange(len(models))
        width = 0.8
        # Highlight Nexa models in indigo; baselines get a neutral gray.
        colors = ['indigo' if 'Nexa' in model else 'lightgray' if data_type == "tabular" else 'gray' for model in models]
        ax.barh(y_pos, scores, width, color=colors)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(models)
    ax.set_xlabel('Score (0-10)')
    ax.set_title(f"{('Nexa Mistral Sci-7B Evaluation: ' if data_type == 'mistral' else '')}{domain}")
    ax.set_xlim(0, 10)
    if data_type == "mistral":
        ax.legend()
    ax.grid(True, axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    return fig
# Display functions (one thin wrapper per tab)
def display_tabular_eval(domain):
    return plot_comparison(domain, "tabular")

def display_llm_eval(domain):
    return plot_comparison(domain, "llm")

def display_mistral_eval(metric):
    return plot_comparison(metric, "mistral")
# Gradio interface
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #e0e0e0; color: #333;}") as demo:
    gr.Markdown("""
    # 🔬 Nexa Evals - Scientific ML Benchmark Suite

    A benchmarking suite for Nexa models across various domains.
    """)
    with gr.Tabs():
        with gr.TabItem("Tabular Models"):
            with gr.Row():
                tabular_domain = gr.Dropdown(
                    choices=list(TABULAR_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="Proteins"
                )
                show_tabular_btn = gr.Button("Show Evaluation")
            tabular_plot = gr.Plot(label="Benchmark Plot")
            show_tabular_btn.click(
                fn=display_tabular_eval,
                inputs=tabular_domain,
                outputs=tabular_plot
            )
        with gr.TabItem("LLMs"):
            with gr.Row():
                llm_domain = gr.Dropdown(
                    choices=list(LLM_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="LLM (General OSIR)"
                )
                show_llm_btn = gr.Button("Show Evaluation")
            llm_plot = gr.Plot(label="Benchmark Plot")
            show_llm_btn.click(
                fn=display_llm_eval,
                inputs=llm_domain,
                outputs=llm_plot
            )
        with gr.TabItem("Nexa Mistral Sci-7B"):
            with gr.Row():
                mistral_metric = gr.Dropdown(
                    choices=list(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"].keys()),
                    label="Select Metric",
                    value="Scientific Utility"
                )
                show_mistral_btn = gr.Button("Show Evaluation")
            mistral_plot = gr.Plot(label="Benchmark Plot")
            show_mistral_btn.click(
                fn=display_mistral_eval,
                inputs=mistral_metric,
                outputs=mistral_plot
            )
with gr.TabItem("About"): | |
gr.Markdown(""" | |
# βΉοΈ About Nexa Evals | |
Nexa Evals benchmarks Nexa models across scientific domains: | |
- **Tabular Models**: Compares Nexa models against baselines. | |
- **LLMs**: Evaluates Nexa language models against competitors. | |
- **Nexa Mistral Sci-7B**: Compares general and physics-specific performance. | |
Scores are on a 1-10 scale. | |
""") | |
demo.launch() | |