import gradio as gr
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
import json
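# Assumed runtime dependencies for this Space (not pinned here): gradio, plotly, and pandas.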
# Domain-specific model evaluations
MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
        "Porter6 (Secondary)": 0.8456,
        "DeepCNF (Secondary)": 0.85,
        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
        "Nexa Bio2 (Tertiary)": 0.90,
    },
    "Astro": {
        "Nexa Astro": 0.97,
        "Baseline CNN": 0.89,
    },
    "Materials": {
        "Nexa Materials": 0.9999,
        "Random Forest Baseline": 0.92,
    },
    "QST": {
        "Nexa PIN Model": 0.80,
        "Quantum TomoNet": 0.85,
    },
    "HEP": {
        "Nexa HEP Model": 0.91,
        "CMSNet": 0.94,
    },
    "CFD": {
        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
}
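# Each domain maps model names to a single headline score on a 0.0-1.0 scale (higher is better),
# e.g. MODEL_EVALS["Proteins"]["AlphaFold2 (Tertiary GDT-TS)"] == 0.924.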
# SCIEVAL/OSIR metrics data
SCIEVAL_METRICS = {
    "Nexa Mistral Sci-7B": {
        "OSIR (General)": {
            "Entropy / Novelty": 6.2,
            "Internal Consistency": 8.5,
            "Hypothesis Framing": 6.8,
            "Thematic Grounding": 7.9,
            "Citation & Structure": 7.3,
            "Symbolism & Math Logic": 6.1,
            "Scientific Utility": 7.6
        },
        "OSIR-Field (Physics)": {
            "Entropy / Novelty": 7.1,
            "Internal Consistency": 8.9,
            "Hypothesis Framing": 7.4,
            "Thematic Grounding": 8.2,
            "Citation & Structure": 6.5,
            "Symbolism & Math Logic": 7.8,
            "Scientific Utility": 8.3
        }
    }
}
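# Each model maps two OSIR rubrics ("OSIR (General)" and "OSIR-Field (Physics)") to the same
# seven metrics, each scored on a 1-10 scale; plot_scieval_comparison() below relies on both
# rubrics sharing identical metric keys.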
def plot_domain_benchmark(domain):
    """Create horizontal bar chart for domain-specific benchmarks"""
    models = list(MODEL_EVALS[domain].keys())
    scores = list(MODEL_EVALS[domain].values())

    # Color coding for Nexa models vs others
    colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=models,
        x=scores,
        orientation='h',
        marker_color=colors,
        text=[f'{score:.3f}' for score in scores],
        textposition='auto'
    ))
    fig.update_layout(
        title=f"Model Benchmark Scores – {domain}",
        yaxis_title="Model",
        xaxis_title="Score",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        showlegend=False
    )
    return fig
def plot_scieval_comparison(model_name):
    """Create horizontal comparison chart for SCIEVAL metrics"""
    if model_name not in SCIEVAL_METRICS:
        return go.Figure()

    metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
    osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
    field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name='OSIR (General)',
        y=metrics,
        x=osir_scores,
        orientation='h',
        marker_color='#FFD700',
        text=[f'{score:.1f}' for score in osir_scores],
        textposition='auto'
    ))
    fig.add_trace(go.Bar(
        name='OSIR-Field (Physics)',
        y=metrics,
        x=field_scores,
        orientation='h',
        marker_color='#FF6B35',
        text=[f'{score:.1f}' for score in field_scores],
        textposition='auto'
    ))
    fig.update_layout(
        title=f"SCIEVAL Metrics Comparison – {model_name}",
        yaxis_title="Metric",
        xaxis_title="Score (1-10)",
        xaxis_range=[0, 10],
        template="plotly_white",
        height=500,
        barmode='group'
    )
    return fig
def create_leaderboard():
    """Create leaderboard table"""
    leaderboard_data = []

    # Add domain benchmark leaders
    for domain, models in MODEL_EVALS.items():
        best_model = max(models.items(), key=lambda x: x[1])
        leaderboard_data.append({
            "Domain": domain,
            "Best Model": best_model[0],
            "Score": f"{best_model[1]:.3f}",
            "Metric Type": "Domain Benchmark"
        })

    # Add SCIEVAL leaders
    for model, evaluations in SCIEVAL_METRICS.items():
        avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
        avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
        leaderboard_data.append({
            "Domain": "OSIR General",
            "Best Model": model,
            "Score": f"{avg_osir:.2f}",
            "Metric Type": "SCIEVAL"
        })
        leaderboard_data.append({
            "Domain": "OSIR Physics",
            "Best Model": model,
            "Score": f"{avg_field:.2f}",
            "Metric Type": "SCIEVAL"
        })

    df = pd.DataFrame(leaderboard_data)
    return df
def get_model_details(domain):
    """Get JSON details for domain models"""
    return json.dumps(MODEL_EVALS[domain], indent=2)

def display_domain_eval(domain):
    """Display domain evaluation results"""
    plot = plot_domain_benchmark(domain)
    details = get_model_details(domain)
    return plot, details

def display_scieval(model_name):
    """Display SCIEVAL results"""
    plot = plot_scieval_comparison(model_name)
    if model_name in SCIEVAL_METRICS:
        details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
    else:
        details = "Model not found in SCIEVAL database"
    return plot, details
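# display_domain_eval and display_scieval each return a (Plotly Figure, JSON string) pair,
# matching the (gr.Plot, gr.Code) output components they are wired to in the UI below.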
# Create Gradio interface
with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔬 Scientific ML Benchmark Suite
    ### Comprehensive evaluation framework for scientific machine learning models

    This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics
    to provide a unified assessment of ML models across scientific disciplines.
    """)

    with gr.Tabs():
        # Domain Benchmarks Tab
        with gr.TabItem("🧪 Domain Benchmarks"):
            gr.Markdown("""
            ### Domain-Specific Model Evaluations
            Compare models across scientific domains including Proteins, Astronomy, Materials Science,
            Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
            """)

            with gr.Row():
                domain_dropdown = gr.Dropdown(
                    choices=list(MODEL_EVALS.keys()),
                    label="Select Scientific Domain",
                    value="Proteins"
                )
                domain_btn = gr.Button("Run Domain Evaluation", variant="primary")

            with gr.Row():
                domain_plot = gr.Plot(label="Domain Benchmark Results")
                domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")

            domain_btn.click(
                display_domain_eval,
                inputs=domain_dropdown,
                outputs=[domain_plot, domain_metrics]
            )
        # SCIEVAL Tab
        with gr.TabItem("📊 SCIEVAL Metrics"):
            gr.Markdown("""
            ### SCIEVAL: Scientific Reasoning Evaluation
            Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.

            **Metrics evaluated:**
            - **Entropy / Novelty**: Originality and information richness
            - **Internal Consistency**: Logical structure and argument continuity
            - **Hypothesis Framing**: Research aim clarity
            - **Thematic Grounding**: Domain focus and relevance
            - **Citation & Structure**: Scientific formatting
            - **Symbolism & Math Logic**: Mathematical rigor
            - **Scientific Utility**: Real-world research value
            """)

            with gr.Row():
                scieval_dropdown = gr.Dropdown(
                    choices=list(SCIEVAL_METRICS.keys()),
                    label="Select Model for SCIEVAL",
                    value="Nexa Mistral Sci-7B"
                )
                scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")

            with gr.Row():
                scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
                scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")

            scieval_btn.click(
                display_scieval,
                inputs=scieval_dropdown,
                outputs=[scieval_plot, scieval_metrics]
            )
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("""
            ### Scientific ML Model Leaderboard
            Current best-performing models across all evaluated domains and metrics.
            """)

            leaderboard_df = create_leaderboard()
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                label="Current Leaders by Domain",
                interactive=False
            )
        # About Tab
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ### About the Scientific ML Benchmark Suite
            This evaluation framework combines two complementary assessment methodologies.

            **Full reference gist explaining the framework:** https://gist.github.com/DarkStarStrix/4a2f4f91b8148e35574fc696ab9715e3

            #### 🎯 Domain Benchmarks
            - **Proteins**: Secondary/tertiary structure prediction accuracy
            - **Astronomy**: Object classification and detection
            - **Materials**: Property prediction and discovery
            - **QST**: Quantum state tomography reconstruction
            - **HEP**: High energy physics event classification
            - **CFD**: Computational fluid dynamics modeling

            #### 🔬 SCIEVAL Framework
            SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
            - **Standardized Evaluation**: Reproducible metrics for scientific LLMs
            - **Domain Adaptation**: Field-specific evaluation extensions
            - **Research Utility**: Assessment of real-world scientific value

            **OSIR-Field Extensions:**
            - `osir-field-physics`: Physics-specific reasoning evaluation
            - `osir-field-bio`: Biological sciences assessment
            - `osir-field-chem`: Chemistry domain evaluation
            - `osir-field-cs`: Computer science applications

            #### 📏 Scoring System
            - **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better)
            - **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions

            #### 🤝 Contributing
            This is an open framework welcoming contributions:
            - New domain-specific test sets
            - Additional evaluation metrics
            - Model submissions for benchmarking

            #### 📚 Citation
            ```
            @misc{scieval2025,
              title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
              author={NEXA Research},
              year={2025},
              url={https://huggingface.co/spaces/osir/scieval}
            }
            ```

            ---
            **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
            """)
    # Initialize with default values on page load
    demo.load(
        lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
        outputs=[domain_plot, domain_metrics]
    )
    demo.load(
        lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"),
                 json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
        outputs=[scieval_plot, scieval_metrics]
    )

if __name__ == "__main__":
    demo.launch()
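# Note: when hosted as a Gradio Space this file is typically executed automatically; to run it
# locally (assuming the dependencies listed above are installed), `python app.py` starts the
# server via demo.launch().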