import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
# Data for Tabular Models (normalized to 0-10 from original 0-1 data)
TABULAR_MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 7.1,
        "Porter6 (Secondary)": 8.5,
        "DeepCNF (Secondary)": 8.5,
        "AlphaFold2 (Tertiary GDT-TS)": 9.2,
        "Nexa Bio2 (Tertiary)": 9.0,
    },
    "Astro": {
        "Nexa Astro": 9.7,
        "Baseline CNN": 8.9,
    },
    "Materials": {
        "Nexa Materials": 10.0,
        "Random Forest Baseline": 9.2,
    },
    "QST": {
        "Nexa PIN Model": 8.0,
        "Quantum TomoNet": 8.5,
    },
    "HEP": {
        "Nexa HEP Model": 9.1,
        "CMSNet": 9.4,
    },
    "CFD": {
        "Nexa CFD Model": 9.2,
        "FlowNet": 8.9,
    },
}
# Data for Nexa Mistral Sci-7B evaluation (per-metric OSIR benchmark scores)
NEXA_MISTRAL_EVALS = {
    "Nexa Mistral Sci-7B": {
        "Scientific Utility": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.5},
        "Symbolism & Math Logic": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.5},
        "Citation & Structure": {"OSIR (General)": 5.5, "OSIR-Field (Physics)": 6.0},
        "Thematic Grounding": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.0},
        "Hypothesis Framing": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.0},
        "Internal Consistency": {"OSIR (General)": 9.0, "OSIR-Field (Physics)": 9.5},
        "Entropy / Novelty": {"OSIR (General)": 6.5, "OSIR-Field (Physics)": 6.0},
    }
}
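# Placeholder data for the "LLMs" tab below (structure: domain -> {model: score},
# mirroring TABULAR_MODEL_EVALS). These are NOT published benchmark results:
# the scores are simply the means of the per-metric OSIR scores above, used so
# the tab renders end to end. Replace with real evaluation data.
LLM_MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 6.7,  # mean of the seven OSIR (General) scores
    },
    "LLM (Physics OSIR-Field)": {
        "Nexa Mistral Sci-7B": 7.5,  # mean of the seven OSIR-Field (Physics) scores
    },
}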
# Plotting function using Matplotlib
def plot_comparison(domain, data_type):
    if data_type == "mistral":
        # For the Mistral tab, `domain` is a metric name; the two bars compare
        # the OSIR (General) and OSIR-Field (Physics) scores for that metric.
        metric = domain
        data = NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric]
        models = list(data.keys())
        scores = list(data.values())
        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
        y_pos = np.arange(len(models))
        width = 0.35
        # Draw one bar per benchmark, each with its own legend entry.
        ax.barh(y_pos[0], scores[0], width, label=models[0], color='yellow')
        ax.barh(y_pos[1], scores[1], width, label=models[1], color='orange')
    else:
        data = TABULAR_MODEL_EVALS[domain] if data_type == "tabular" else LLM_MODEL_EVALS[domain]
        models = list(data.keys())
        scores = list(data.values())
        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
        y_pos = np.arange(len(models))
        width = 0.8
        # Highlight Nexa models in indigo; baselines get a neutral gray.
        colors = ['indigo' if 'Nexa' in model else 'lightgray' if data_type == "tabular" else 'gray' for model in models]
        ax.barh(y_pos, scores, width, color=colors)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(models)
    ax.set_xlabel('Score (0-10)')
    ax.set_title(f"{('Nexa Mistral Sci-7B Evaluation: ' if data_type == 'mistral' else '')}{domain}")
    ax.set_xlim(0, 10)
    if data_type == "mistral":
        ax.legend()
    ax.grid(True, axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    return fig
# Display functions (one thin wrapper per tab)
def display_tabular_eval(domain):
    return plot_comparison(domain, "tabular")

def display_llm_eval(domain):
    return plot_comparison(domain, "llm")

def display_mistral_eval(metric):
    return plot_comparison(metric, "mistral")
# Gradio interface
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #e0e0e0; color: #333;}") as demo:
    gr.Markdown("""
    # 🔬 Nexa Evals - Scientific ML Benchmark Suite

    A benchmarking suite for Nexa models across various domains.
    """)
    with gr.Tabs():
        with gr.TabItem("Tabular Models"):
            with gr.Row():
                tabular_domain = gr.Dropdown(
                    choices=list(TABULAR_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="Proteins"
                )
                show_tabular_btn = gr.Button("Show Evaluation")
            tabular_plot = gr.Plot(label="Benchmark Plot")
            show_tabular_btn.click(
                fn=display_tabular_eval,
                inputs=tabular_domain,
                outputs=tabular_plot
            )
        with gr.TabItem("LLMs"):
            with gr.Row():
                llm_domain = gr.Dropdown(
                    choices=list(LLM_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="LLM (General OSIR)"
                )
                show_llm_btn = gr.Button("Show Evaluation")
            llm_plot = gr.Plot(label="Benchmark Plot")
            show_llm_btn.click(
                fn=display_llm_eval,
                inputs=llm_domain,
                outputs=llm_plot
            )
        with gr.TabItem("Nexa Mistral Sci-7B"):
            with gr.Row():
                mistral_metric = gr.Dropdown(
                    choices=list(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"].keys()),
                    label="Select Metric",
                    value="Scientific Utility"
                )
                show_mistral_btn = gr.Button("Show Evaluation")
            mistral_plot = gr.Plot(label="Benchmark Plot")
            show_mistral_btn.click(
                fn=display_mistral_eval,
                inputs=mistral_metric,
                outputs=mistral_plot
            )
with gr.TabItem("About"): | |
gr.Markdown(""" | |
# βΉοΈ About Nexa Evals | |
Nexa Evals benchmarks Nexa models across scientific domains: | |
- **Tabular Models**: Compares Nexa models against baselines. | |
- **LLMs**: Evaluates Nexa language models against competitors. | |
- **Nexa Mistral Sci-7B**: Compares general and physics-specific performance. | |
Scores are on a 1-10 scale. | |
""") | |
demo.launch() | |