import gradio as gr import plotly.graph_objs as go import plotly.express as px import pandas as pd import json # Domain-specific model evaluations MODEL_EVALS = { "Proteins": { "Nexa Bio1 (Secondary)": 0.71, "Porter6 (Secondary)": 0.8456, "DeepCNF (Secondary)": 0.85, "AlphaFold2 (Tertiary GDT-TS)": 0.924, "Nexa Bio2 (Tertiary)": 0.90, }, "Astro": { "Nexa Astro": 0.97, "Baseline CNN": 0.89, }, "Materials": { "Nexa Materials": 0.9999, "Random Forest Baseline": 0.92, }, "QST": { "Nexa PIN Model": 0.80, "Quantum TomoNet": 0.85, }, "HEP": { "Nexa HEP Model": 0.91, "CMSNet": 0.94, }, "CFD": { "Nexa CFD Model": 0.92, "FlowNet": 0.89, }, } # SCIEVAL/OSIR metrics data SCIEVAL_METRICS = { "Nexa Mistral Sci-7B": { "OSIR (General)": { "Entropy / Novelty": 6.2, "Internal Consistency": 8.5, "Hypothesis Framing": 6.8, "Thematic Grounding": 7.9, "Citation & Structure": 7.3, "Symbolism & Math Logic": 6.1, "Scientific Utility": 7.6 }, "OSIR-Field (Physics)": { "Entropy / Novelty": 7.1, "Internal Consistency": 8.9, "Hypothesis Framing": 7.4, "Thematic Grounding": 8.2, "Citation & Structure": 6.5, "Symbolism & Math Logic": 7.8, "Scientific Utility": 8.3 } } } def plot_domain_benchmark(domain): """Create horizontal bar chart for domain-specific benchmarks""" models = list(MODEL_EVALS[domain].keys()) scores = list(MODEL_EVALS[domain].values()) # Color coding for Nexa models vs others colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models] fig = go.Figure() fig.add_trace(go.Bar( y=models, x=scores, orientation='h', marker_color=colors, text=[f'{score:.3f}' for score in scores], textposition='auto' )) fig.update_layout( title=f"Model Benchmark Scores โ€” {domain}", yaxis_title="Model", xaxis_title="Score", xaxis_range=[0, 1.0], template="plotly_white", height=500, showlegend=False ) return fig def plot_scieval_comparison(model_name): """Create horizontal comparison chart for SCIEVAL metrics""" if model_name not in SCIEVAL_METRICS: return go.Figure() metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys()) osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values()) field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values()) fig = go.Figure() fig.add_trace(go.Bar( name='OSIR (General)', y=metrics, x=osir_scores, orientation='h', marker_color='#FFD700', text=[f'{score:.1f}' for score in osir_scores], textposition='auto' )) fig.add_trace(go.Bar( name='OSIR-Field (Physics)', y=metrics, x=field_scores, orientation='h', marker_color='#FF6B35', text=[f'{score:.1f}' for score in field_scores], textposition='auto' )) fig.update_layout( title=f"SCIEVAL Metrics Comparison โ€” {model_name}", yaxis_title="Metric", xaxis_title="Score (1-10)", xaxis_range=[0, 10], template="plotly_white", height=500, barmode='group' ) return fig def create_leaderboard(): """Create leaderboard table""" leaderboard_data = [] # Add domain benchmark leaders for domain, models in MODEL_EVALS.items(): best_model = max(models.items(), key=lambda x: x[1]) leaderboard_data.append({ "Domain": domain, "Best Model": best_model[0], "Score": f"{best_model[1]:.3f}", "Metric Type": "Domain Benchmark" }) # Add SCIEVAL leaders for model, evaluations in SCIEVAL_METRICS.items(): avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"]) avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"]) leaderboard_data.append({ "Domain": "OSIR General", "Best Model": model, "Score": f"{avg_osir:.2f}", "Metric Type": "SCIEVAL" }) leaderboard_data.append({ "Domain": "OSIR Physics", "Best Model": model, "Score": f"{avg_field:.2f}", "Metric Type": "SCIEVAL" }) df = pd.DataFrame(leaderboard_data) return df def get_model_details(domain): """Get JSON details for domain models""" return json.dumps(MODEL_EVALS[domain], indent=2) def display_domain_eval(domain): """Display domain evaluation results""" plot = plot_domain_benchmark(domain) details = get_model_details(domain) return plot, details def display_scieval(model_name): """Display SCIEVAL results""" plot = plot_scieval_comparison(model_name) if model_name in SCIEVAL_METRICS: details = json.dumps(SCIEVAL_METRICS[model_name], indent=2) else: details = "Model not found in SCIEVAL database" return plot, details # Create Gradio interface with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # ๐Ÿ”ฌ Scientific ML Benchmark Suite ### Comprehensive evaluation framework for scientific machine learning models This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide comprehensive assessment of ML models across scientific disciplines. """) with gr.Tabs(): # Domain Benchmarks Tab with gr.TabItem("๐Ÿงช Domain Benchmarks"): gr.Markdown(""" ### Domain-Specific Model Evaluations Compare models across scientific domains including Proteins, Astronomy, Materials Science, Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD). """) with gr.Row(): domain_dropdown = gr.Dropdown( choices=list(MODEL_EVALS.keys()), label="Select Scientific Domain", value="Proteins" ) domain_btn = gr.Button("Run Domain Evaluation", variant="primary") with gr.Row(): domain_plot = gr.Plot(label="Domain Benchmark Results") domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json") domain_btn.click( display_domain_eval, inputs=domain_dropdown, outputs=[domain_plot, domain_metrics] ) # SCIEVAL Tab with gr.TabItem("๐Ÿ“Š SCIEVAL Metrics"): gr.Markdown(""" ### SCIEVAL: Scientific Reasoning Evaluation Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework. **Metrics evaluated:** - **Entropy/Novelty**: Originality and information richness - **Internal Consistency**: Logical structure and argument continuity - **Hypothesis Framing**: Research aim clarity - **Thematic Grounding**: Domain focus and relevance - **Citation & Structure**: Scientific formatting - **Symbolism & Math Logic**: Mathematical rigor - **Scientific Utility**: Real-world research value """) with gr.Row(): scieval_dropdown = gr.Dropdown( choices=list(SCIEVAL_METRICS.keys()), label="Select Model for SCIEVAL", value="Nexa Mistral Sci-7B" ) scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary") with gr.Row(): scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison") scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json") scieval_btn.click( display_scieval, inputs=scieval_dropdown, outputs=[scieval_plot, scieval_metrics] ) # Leaderboard Tab with gr.TabItem("๐Ÿ† Leaderboard"): gr.Markdown(""" ### Scientific ML Model Leaderboard Current best-performing models across all evaluated domains and metrics. """) leaderboard_df = create_leaderboard() leaderboard_table = gr.Dataframe( value=leaderboard_df, label="Current Leaders by Domain", interactive=False ) # About Tab with gr.TabItem("โ„น๏ธ About"): gr.Markdown(""" ### About the Scientific ML Benchmark Suite This comprehensive evaluation framework combines two powerful assessment methodologies: #### Full reference gist for explaining the framework: https://gist.github.com/DarkStarStrix/4a2f4f91b8148e35574fc696ab9715e3 #### ๐ŸŽฏ Domain Benchmarks - **Proteins**: Secondary/tertiary structure prediction accuracy - **Astronomy**: Object classification and detection - **Materials**: Property prediction and discovery - **QST**: Quantum state tomography reconstruction - **HEP**: High energy physics event classification - **CFD**: Computational fluid dynamics modeling #### ๐Ÿ”ฌ SCIEVAL Framework SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing: - **Standardized Evaluation**: Reproducible metrics for scientific LLMs - **Domain Adaptation**: Field-specific evaluation extensions - **Research Utility**: Assessment of real-world scientific value **OSIR-Field Extensions:** - `osir-field-physics`: Physics-specific reasoning evaluation - `osir-field-bio`: Biological sciences assessment - `osir-field-chem`: Chemistry domain evaluation - `osir-field-cs`: Computer science applications #### ๐Ÿ“ˆ Scoring System - **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better) - **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions #### ๐Ÿค Contributing This is an open framework welcoming contributions: - New domain-specific test sets - Additional evaluation metrics - Model submissions for benchmarking #### ๐Ÿ“„ Citation ``` @misc{scieval2024, title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models}, author={NEXA Research}, year={2025}, url={https://huggingface.co/spaces/osir/scieval} } ``` --- **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly """) # Initialize with default values demo.load( lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")), outputs=[domain_plot, domain_metrics] ) demo.load( lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"), json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)), outputs=[scieval_plot, scieval_metrics] ) if __name__ == "__main__": demo.launch()