# NexaEvals — app.py (Hugging Face Space by Allanatrix)
# Last update: commit 08d1f1b ("Update app.py"), 5.63 kB
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
# Benchmark scores for tabular-domain models, rescaled from the original
# 0-1 range onto 0-10.  Outer keys are scientific domains; inner maps are
# model name -> score.
TABULAR_MODEL_EVALS = dict(
    Proteins={
        "Nexa Bio1 (Secondary)": 7.1,
        "Porter6 (Secondary)": 8.5,
        "DeepCNF (Secondary)": 8.5,
        "AlphaFold2 (Tertiary GDT-TS)": 9.2,
        "Nexa Bio2 (Tertiary)": 9.0,
    },
    Astro={"Nexa Astro": 9.7, "Baseline CNN": 8.9},
    Materials={"Nexa Materials": 10.0, "Random Forest Baseline": 9.2},
    QST={"Nexa PIN Model": 8.0, "Quantum TomoNet": 8.5},
    HEP={"Nexa HEP Model": 9.1, "CMSNet": 9.4},
    CFD={"Nexa CFD Model": 9.2, "FlowNet": 8.9},
)
# Per-metric scores for Nexa Mistral Sci-7B on the general OSIR benchmark
# vs. the physics-specific OSIR-Field benchmark (0-10 scale).
NEXA_MISTRAL_EVALS = {
    "Nexa Mistral Sci-7B": {
        "Scientific Utility": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.5},
        "Symbolism & Math Logic": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.5},
        "Citation & Structure": {"OSIR (General)": 5.5, "OSIR-Field (Physics)": 6.0},
        "Thematic Grounding": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.0},
        "Hypothesis Framing": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.0},
        "Internal Consistency": {"OSIR (General)": 9.0, "OSIR-Field (Physics)": 9.5},
        "Entropy / Novelty": {"OSIR (General)": 6.5, "OSIR-Field (Physics)": 6.0},
    }
}
# Data for the "LLMs" tab.  Bug fix: plot_comparison() and the LLMs tab
# reference LLM_MODEL_EVALS, but it was never defined, so building the UI
# raised NameError at startup.  The "LLM (General OSIR)" key is required:
# it is the dropdown's default value.
# NOTE(review): scores below are placeholders on the same 0-10 scale —
# replace with real benchmark numbers before publishing.
LLM_MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 6.7,
        "Mistral-7B (Base)": 6.0,
        "GPT-4 (Reference)": 8.5,
    },
    "LLM (Physics OSIR-Field)": {
        "Nexa Mistral Sci-7B": 7.5,
        "Mistral-7B (Base)": 5.5,
        "GPT-4 (Reference)": 8.0,
    },
}
# Plotting function using Matplotlib
def plot_comparison(domain, data_type):
    """Render a horizontal bar chart for one benchmark view.

    Args:
        domain: For data_type "tabular"/"llm", a key of the matching
            *_MODEL_EVALS dict; for "mistral", a metric name inside
            NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"].
        data_type: One of "tabular", "llm", or "mistral".

    Returns:
        The matplotlib Figure containing the chart (consumed by gr.Plot).
    """
    if data_type == "mistral":
        metric = domain
        data = NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric]
        models = list(data.keys())
        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
        y_pos = np.arange(len(models))
        width = 0.35
        # Bug fix: the old code passed the full y_pos array to two barh()
        # calls with single-element score slices (scores[:1] / scores[1:]);
        # NumPy broadcasting duplicated each score across every row, drawing
        # overlapping duplicate bars.  Draw one labelled bar per benchmark
        # track at its own y position instead.
        colors = ['yellow', 'orange']
        for i, (model, score) in enumerate(data.items()):
            ax.barh(y_pos[i], score, width, label=model,
                    color=colors[i % len(colors)])
    else:
        # Tabular and LLM views share the same single-series layout.
        data = TABULAR_MODEL_EVALS[domain] if data_type == "tabular" else LLM_MODEL_EVALS[domain]
        models = list(data.keys())
        scores = list(data.values())
        fig, ax = plt.subplots(figsize=(8, 6), facecolor='#e0e0e0')
        y_pos = np.arange(len(models))
        width = 0.8
        # Nexa models are highlighted in indigo; baselines get a neutral gray.
        colors = ['indigo' if 'Nexa' in model else 'lightgray' if data_type == "tabular" else 'gray' for model in models]
        ax.barh(y_pos, scores, width, color=colors)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(models)
    ax.set_xlabel('Score (1-10)')
    ax.set_title(f"{('Nexa Mistral Sci-7B Evaluation: ' if data_type == 'mistral' else '')}{domain}")
    ax.set_xlim(0, 10)
    if data_type == "mistral":
        # Only the grouped mistral view has per-bar labels worth a legend.
        ax.legend()
    ax.grid(True, axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    return fig
# Display functions
def display_tabular_eval(domain):
    """Return the benchmark figure for the chosen tabular-model domain."""
    chart = plot_comparison(domain, "tabular")
    return chart
def display_llm_eval(domain):
    """Return the benchmark figure for the chosen LLM domain."""
    chart = plot_comparison(domain, "llm")
    return chart
def display_mistral_eval(metric):
    """Return the Nexa Mistral Sci-7B figure for the chosen metric."""
    chart = plot_comparison(metric, "mistral")
    return chart
# Gradio interface
# Layout: four tabs — one per evaluation family plus an About page.  Each
# evaluation tab pairs a dropdown + button row with a gr.Plot output wired
# to the matching display_* helper.
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #e0e0e0; color: #333;}") as demo:
    # Page header (rendered as Markdown).
    gr.Markdown("""
# πŸ”¬ Nexa Evals β€” Scientific ML Benchmark Suite
A benchmarking suite for Nexa models across various domains.
""")
    with gr.Tabs():
        # Tab 1: tabular-model benchmarks, one chart per scientific domain.
        with gr.TabItem("Tabular Models"):
            with gr.Row():
                tabular_domain = gr.Dropdown(
                    choices=list(TABULAR_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="Proteins"
                )
                show_tabular_btn = gr.Button("Show Evaluation")
            tabular_plot = gr.Plot(label="Benchmark Plot")
            show_tabular_btn.click(
                fn=display_tabular_eval,
                inputs=tabular_domain,
                outputs=tabular_plot
            )
        # Tab 2: LLM benchmarks.
        # NOTE(review): LLM_MODEL_EVALS must exist at module level before
        # this Blocks context is built — it is evaluated here at startup.
        # Confirm it is defined earlier in the file (it is referenced but
        # not visible near the other data constants in this revision).
        with gr.TabItem("LLMs"):
            with gr.Row():
                llm_domain = gr.Dropdown(
                    choices=list(LLM_MODEL_EVALS.keys()),
                    label="Select Domain",
                    value="LLM (General OSIR)"
                )
                show_llm_btn = gr.Button("Show Evaluation")
            llm_plot = gr.Plot(label="Benchmark Plot")
            show_llm_btn.click(
                fn=display_llm_eval,
                inputs=llm_domain,
                outputs=llm_plot
            )
        # Tab 3: per-metric comparison of general vs physics-field OSIR
        # scores for Nexa Mistral Sci-7B.
        with gr.TabItem("Nexa Mistral Sci-7B"):
            with gr.Row():
                mistral_metric = gr.Dropdown(
                    choices=list(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"].keys()),
                    label="Select Metric",
                    value="Scientific Utility"
                )
                show_mistral_btn = gr.Button("Show Evaluation")
            mistral_plot = gr.Plot(label="Benchmark Plot")
            show_mistral_btn.click(
                fn=display_mistral_eval,
                inputs=mistral_metric,
                outputs=mistral_plot
            )
        # Tab 4: static description of the suite.
        with gr.TabItem("About"):
            gr.Markdown("""
# ℹ️ About Nexa Evals
Nexa Evals benchmarks Nexa models across scientific domains:
- **Tabular Models**: Compares Nexa models against baselines.
- **LLMs**: Evaluates Nexa language models against competitors.
- **Nexa Mistral Sci-7B**: Compares general and physics-specific performance.
Scores are on a 1-10 scale.
""")
# Start the Gradio server (blocking call).
demo.launch()