import gradio as gr
import plotly.graph_objs as go
import pandas as pd
import json
# Domain-specific model evaluations
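# Scores are each domain's headline accuracy-style metric on a 0-1 scale (higher is better),
# listing Nexa models alongside baseline/reference models.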
MODEL_EVALS = {
"Proteins": {
"Nexa Bio1 (Secondary)": 0.71,
"Porter6 (Secondary)": 0.8456,
"DeepCNF (Secondary)": 0.85,
"AlphaFold2 (Tertiary GDT-TS)": 0.924,
"Nexa Bio2 (Tertiary)": 0.90,
},
"Astro": {
"Nexa Astro": 0.97,
"Baseline CNN": 0.89,
},
"Materials": {
"Nexa Materials": 0.9999,
"Random Forest Baseline": 0.92,
},
"QST": {
"Nexa PIN Model": 0.80,
"Quantum TomoNet": 0.85,
},
"HEP": {
"Nexa HEP Model": 0.91,
"CMSNet": 0.94,
},
"CFD": {
"Nexa CFD Model": 0.92,
"FlowNet": 0.89,
},
}
# SCIEVAL/OSIR metrics data
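# Per-model OSIR rubric scores on a 1-10 scale, reported for the general track and the physics field extension.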
SCIEVAL_METRICS = {
"Nexa Mistral Sci-7B": {
"OSIR (General)": {
"Entropy / Novelty": 6.2,
"Internal Consistency": 8.5,
"Hypothesis Framing": 6.8,
"Thematic Grounding": 7.9,
"Citation & Structure": 7.3,
"Symbolism & Math Logic": 6.1,
"Scientific Utility": 7.6
},
"OSIR-Field (Physics)": {
"Entropy / Novelty": 7.1,
"Internal Consistency": 8.9,
"Hypothesis Framing": 7.4,
"Thematic Grounding": 8.2,
"Citation & Structure": 6.5,
"Symbolism & Math Logic": 7.8,
"Scientific Utility": 8.3
}
}
}
def plot_domain_benchmark(domain):
"""Create horizontal bar chart for domain-specific benchmarks"""
models = list(MODEL_EVALS[domain].keys())
scores = list(MODEL_EVALS[domain].values())
# Color coding for Nexa models vs others
colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]
fig = go.Figure()
fig.add_trace(go.Bar(
y=models,
x=scores,
orientation='h',
marker_color=colors,
text=[f'{score:.3f}' for score in scores],
textposition='auto'
))
fig.update_layout(
title=f"Model Benchmark Scores - {domain}",
yaxis_title="Model",
xaxis_title="Score",
xaxis_range=[0, 1.0],
template="plotly_white",
height=500,
showlegend=False
)
return fig
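# Example usage (illustrative sketch, not called by the app):
#   fig = plot_domain_benchmark("Proteins")
#   fig.write_html("proteins_benchmark.html")  # arbitrary filename; fig.show() also works for interactive viewing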
def plot_scieval_comparison(model_name):
"""Create horizontal comparison chart for SCIEVAL metrics"""
if model_name not in SCIEVAL_METRICS:
return go.Figure()
metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())
fig = go.Figure()
fig.add_trace(go.Bar(
name='OSIR (General)',
y=metrics,
x=osir_scores,
orientation='h',
marker_color='#FFD700',
text=[f'{score:.1f}' for score in osir_scores],
textposition='auto'
))
fig.add_trace(go.Bar(
name='OSIR-Field (Physics)',
y=metrics,
x=field_scores,
orientation='h',
marker_color='#FF6B35',
text=[f'{score:.1f}' for score in field_scores],
textposition='auto'
))
fig.update_layout(
title=f"SCIEVAL Metrics Comparison - {model_name}",
yaxis_title="Metric",
xaxis_title="Score (1-10)",
xaxis_range=[0, 10],
template="plotly_white",
height=500,
barmode='group'
)
return fig
def create_leaderboard():
"""Create leaderboard table"""
leaderboard_data = []
# Add domain benchmark leaders
for domain, models in MODEL_EVALS.items():
best_model = max(models.items(), key=lambda x: x[1])
leaderboard_data.append({
"Domain": domain,
"Best Model": best_model[0],
"Score": f"{best_model[1]:.3f}",
"Metric Type": "Domain Benchmark"
})
# Add SCIEVAL leaders
for model, evaluations in SCIEVAL_METRICS.items():
avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
leaderboard_data.append({
"Domain": "OSIR General",
"Best Model": model,
"Score": f"{avg_osir:.2f}",
"Metric Type": "SCIEVAL"
})
leaderboard_data.append({
"Domain": "OSIR Physics",
"Best Model": model,
"Score": f"{avg_field:.2f}",
"Metric Type": "SCIEVAL"
})
df = pd.DataFrame(leaderboard_data)
return df
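# Example usage (illustrative sketch, not called by the app):
#   df = create_leaderboard()
#   print(df.to_string(index=False))
#   df.to_csv("leaderboard.csv", index=False)  # arbitrary filename for an optional export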
def get_model_details(domain):
"""Get JSON details for domain models"""
return json.dumps(MODEL_EVALS[domain], indent=2)
def display_domain_eval(domain):
"""Display domain evaluation results"""
plot = plot_domain_benchmark(domain)
details = get_model_details(domain)
return plot, details
def display_scieval(model_name):
"""Display SCIEVAL results"""
plot = plot_scieval_comparison(model_name)
if model_name in SCIEVAL_METRICS:
details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
else:
details = "Model not found in SCIEVAL database"
return plot, details
# Create Gradio interface
with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# 🔬 Scientific ML Benchmark Suite
### A comprehensive evaluation framework for scientific machine learning models
This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
a unified assessment of ML models across scientific disciplines.
""")
with gr.Tabs():
# Domain Benchmarks Tab
with gr.TabItem("🧪 Domain Benchmarks"):
gr.Markdown("""
### Domain-Specific Model Evaluations
Compare models across scientific domains including Proteins, Astronomy, Materials Science,
Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
""")
with gr.Row():
domain_dropdown = gr.Dropdown(
choices=list(MODEL_EVALS.keys()),
label="Select Scientific Domain",
value="Proteins"
)
domain_btn = gr.Button("Run Domain Evaluation", variant="primary")
with gr.Row():
domain_plot = gr.Plot(label="Domain Benchmark Results")
domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")
domain_btn.click(
display_domain_eval,
inputs=domain_dropdown,
outputs=[domain_plot, domain_metrics]
)
# SCIEVAL Tab
with gr.TabItem("📊 SCIEVAL Metrics"):
gr.Markdown("""
### SCIEVAL: Scientific Reasoning Evaluation
Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.
**Metrics evaluated:**
- **Entropy/Novelty**: Originality and information richness
- **Internal Consistency**: Logical structure and argument continuity
- **Hypothesis Framing**: Research aim clarity
- **Thematic Grounding**: Domain focus and relevance
- **Citation & Structure**: Scientific formatting
- **Symbolism & Math Logic**: Mathematical rigor
- **Scientific Utility**: Real-world research value
""")
with gr.Row():
scieval_dropdown = gr.Dropdown(
choices=list(SCIEVAL_METRICS.keys()),
label="Select Model for SCIEVAL",
value="Nexa Mistral Sci-7B"
)
scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")
with gr.Row():
scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")
scieval_btn.click(
display_scieval,
inputs=scieval_dropdown,
outputs=[scieval_plot, scieval_metrics]
)
# Leaderboard Tab
with gr.TabItem("🏆 Leaderboard"):
gr.Markdown("""
### Scientific ML Model Leaderboard
Current best-performing models across all evaluated domains and metrics.
""")
leaderboard_df = create_leaderboard()
leaderboard_table = gr.Dataframe(
value=leaderboard_df,
label="Current Leaders by Domain",
interactive=False
)
# About Tab
with gr.TabItem("ℹ️ About"):
gr.Markdown("""
### About the Scientific ML Benchmark Suite
This comprehensive evaluation framework combines two powerful assessment methodologies:
#### Full reference gist explaining the framework: https://gist.github.com/DarkStarStrix/4a2f4f91b8148e35574fc696ab9715e3
#### 🎯 Domain Benchmarks
- **Proteins**: Secondary/tertiary structure prediction accuracy
- **Astronomy**: Object classification and detection
- **Materials**: Property prediction and discovery
- **QST**: Quantum state tomography reconstruction
- **HEP**: High energy physics event classification
- **CFD**: Computational fluid dynamics modeling
#### 🔬 SCIEVAL Framework
SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
- **Standardized Evaluation**: Reproducible metrics for scientific LLMs
- **Domain Adaptation**: Field-specific evaluation extensions
- **Research Utility**: Assessment of real-world scientific value
**OSIR-Field Extensions:**
- `osir-field-physics`: Physics-specific reasoning evaluation
- `osir-field-bio`: Biological sciences assessment
- `osir-field-chem`: Chemistry domain evaluation
- `osir-field-cs`: Computer science applications
#### 📈 Scoring System
- **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better)
- **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions
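A minimal sketch of putting the two scales on a common footing for side-by-side comparison (illustrative only; `to_unit_interval` is a hypothetical helper, not part of the OSIR framework):
```
def to_unit_interval(score, low, high):
    # Linearly map a score from [low, high] onto [0, 1]
    return (score - low) / (high - low)

to_unit_interval(0.924, 0.0, 1.0)   # domain benchmark score, already in [0, 1]
to_unit_interval(8.3, 1.0, 10.0)    # SCIEVAL metric, rescales to roughly 0.81
```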
#### 🤝 Contributing
This is an open framework welcoming contributions:
- New domain-specific test sets
- Additional evaluation metrics
- Model submissions for benchmarking
#### 📄 Citation
```
@misc{scieval2025,
title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
author={NEXA Research},
year={2025},
url={https://huggingface.co/spaces/osir/scieval}
}
```
---
**License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
""")
# Initialize with default values
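# demo.load fires when the page first loads in the browser, pre-populating the plots and JSON panels with the default selections.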
demo.load(
lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
outputs=[domain_plot, domain_metrics]
)
demo.load(
lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"),
json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
outputs=[scieval_plot, scieval_metrics]
)
if __name__ == "__main__":
demo.launch()