import gradio as gr
import plotly.graph_objs as go
import pandas as pd
import json

# Domain-specific model evaluations
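# Scores are on a 0.0-1.0 scale (higher is better); each domain pairs Nexa models with reference/baseline models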
MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
        "Porter6 (Secondary)": 0.8456,
        "DeepCNF (Secondary)": 0.85,
        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
        "Nexa Bio2 (Tertiary)": 0.90,
    },
    "Astro": {
        "Nexa Astro": 0.97,
        "Baseline CNN": 0.89,
    },
    "Materials": {
        "Nexa Materials": 0.9999,
        "Random Forest Baseline": 0.92,
    },
    "QST": {
        "Nexa PIN Model": 0.80,
        "Quantum TomoNet": 0.85,
    },
    "HEP": {
        "Nexa HEP Model": 0.91,
        "CMSNet": 0.94,
    },
    "CFD": {
        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
}

# SCIEVAL/OSIR metrics data
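# Each OSIR metric is scored on a 1-10 scale across seven reasoning dimensions (listed in the SCIEVAL tab)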
SCIEVAL_METRICS = {
    "Nexa Mistral Sci-7B": {
        "OSIR (General)": {
            "Entropy / Novelty": 6.2,
            "Internal Consistency": 8.5,
            "Hypothesis Framing": 6.8,
            "Thematic Grounding": 7.9,
            "Citation & Structure": 7.3,
            "Symbolism & Math Logic": 6.1,
            "Scientific Utility": 7.6
        },
        "OSIR-Field (Physics)": {
            "Entropy / Novelty": 7.1,
            "Internal Consistency": 8.9,
            "Hypothesis Framing": 7.4,
            "Thematic Grounding": 8.2,
            "Citation & Structure": 6.5,
            "Symbolism & Math Logic": 7.8,
            "Scientific Utility": 8.3
        }
    }
}

def plot_domain_benchmark(domain):
    """Create horizontal bar chart for domain-specific benchmarks"""
    models = list(MODEL_EVALS[domain].keys())
    scores = list(MODEL_EVALS[domain].values())
    
    # Color coding for Nexa models vs others
    colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]
    
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=models, 
        x=scores, 
        orientation='h',
        marker_color=colors,
        text=[f'{score:.3f}' for score in scores],
        textposition='auto'
    ))
    
    fig.update_layout(
        title=f"Model Benchmark Scores β€” {domain}",
        yaxis_title="Model",
        xaxis_title="Score",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        showlegend=False
    )
    return fig

def plot_scieval_comparison(model_name):
    """Create horizontal comparison chart for SCIEVAL metrics"""
    if model_name not in SCIEVAL_METRICS:
        return go.Figure()
    
    metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
    osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
    field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())
    
    fig = go.Figure()
    
    fig.add_trace(go.Bar(
        name='OSIR (General)',
        y=metrics,
        x=osir_scores,
        orientation='h',
        marker_color='#FFD700',
        text=[f'{score:.1f}' for score in osir_scores],
        textposition='auto'
    ))
    
    fig.add_trace(go.Bar(
        name='OSIR-Field (Physics)',
        y=metrics,
        x=field_scores,
        orientation='h',
        marker_color='#FF6B35',
        text=[f'{score:.1f}' for score in field_scores],
        textposition='auto'
    ))
    
    fig.update_layout(
        title=f"SCIEVAL Metrics Comparison β€” {model_name}",
        yaxis_title="Metric",
        xaxis_title="Score (1-10)",
        xaxis_range=[0, 10],
        template="plotly_white",
        height=500,
        barmode='group'
    )
    return fig

def create_leaderboard():
    """Create leaderboard table"""
    leaderboard_data = []
    
    # Add domain benchmark leaders
    for domain, models in MODEL_EVALS.items():
        best_model = max(models.items(), key=lambda x: x[1])
        leaderboard_data.append({
            "Domain": domain,
            "Best Model": best_model[0],
            "Score": f"{best_model[1]:.3f}",
            "Metric Type": "Domain Benchmark"
        })
    
    # Add SCIEVAL leaders
    for model, evaluations in SCIEVAL_METRICS.items():
        avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
        avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
        
        leaderboard_data.append({
            "Domain": "OSIR General",
            "Best Model": model,
            "Score": f"{avg_osir:.2f}",
            "Metric Type": "SCIEVAL"
        })
        
        leaderboard_data.append({
            "Domain": "OSIR Physics",
            "Best Model": model,
            "Score": f"{avg_field:.2f}",
            "Metric Type": "SCIEVAL"
        })
    
    df = pd.DataFrame(leaderboard_data)
    return df

def get_model_details(domain):
    """Get JSON details for domain models"""
    return json.dumps(MODEL_EVALS[domain], indent=2)

def display_domain_eval(domain):
    """Display domain evaluation results"""
    plot = plot_domain_benchmark(domain)
    details = get_model_details(domain)
    return plot, details

def display_scieval(model_name):
    """Display SCIEVAL results"""
    plot = plot_scieval_comparison(model_name)
    if model_name in SCIEVAL_METRICS:
        details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
    else:
        details = "Model not found in SCIEVAL database"
    return plot, details

# Create Gradio interface
with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔬 Scientific ML Benchmark Suite
    ### Comprehensive evaluation framework for scientific machine learning models
    
    This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
    a unified assessment of ML models across scientific disciplines.
    """)
    
    with gr.Tabs():
        # Domain Benchmarks Tab
        with gr.TabItem("🧪 Domain Benchmarks"):
            gr.Markdown("""
            ### Domain-Specific Model Evaluations
            Compare models across scientific domains including Proteins, Astronomy, Materials Science, 
            Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
            """)
            
            with gr.Row():
                domain_dropdown = gr.Dropdown(
                    choices=list(MODEL_EVALS.keys()), 
                    label="Select Scientific Domain",
                    value="Proteins"
                )
                domain_btn = gr.Button("Run Domain Evaluation", variant="primary")
            
            with gr.Row():
                domain_plot = gr.Plot(label="Domain Benchmark Results")
                domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")
            
            domain_btn.click(
                display_domain_eval, 
                inputs=domain_dropdown, 
                outputs=[domain_plot, domain_metrics]
            )
        
        # SCIEVAL Tab
        with gr.TabItem("📊 SCIEVAL Metrics"):
            gr.Markdown("""
            ### SCIEVAL: Scientific Reasoning Evaluation
            Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.
            
            **Metrics evaluated:**
            - **Entropy/Novelty**: Originality and information richness
            - **Internal Consistency**: Logical structure and argument continuity  
            - **Hypothesis Framing**: Research aim clarity
            - **Thematic Grounding**: Domain focus and relevance
            - **Citation & Structure**: Scientific formatting
            - **Symbolism & Math Logic**: Mathematical rigor
            - **Scientific Utility**: Real-world research value
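
            The Leaderboard tab reports the mean of these seven scores for each evaluation; for example, the OSIR (General) average for Nexa Mistral Sci-7B is (6.2 + 8.5 + 6.8 + 7.9 + 7.3 + 6.1 + 7.6) / 7 = 7.2.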
            """)
            
            with gr.Row():
                scieval_dropdown = gr.Dropdown(
                    choices=list(SCIEVAL_METRICS.keys()),
                    label="Select Model for SCIEVAL",
                    value="Nexa Mistral Sci-7B"
                )
                scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")
            
            with gr.Row():
                scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
                scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")
            
            scieval_btn.click(
                display_scieval,
                inputs=scieval_dropdown,
                outputs=[scieval_plot, scieval_metrics]
            )
        
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("""
            ### Scientific ML Model Leaderboard
            Current best-performing models across all evaluated domains and metrics.
            """)
            
            leaderboard_df = create_leaderboard()
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                label="Current Leaders by Domain",
                interactive=False
            )
        
        # About Tab
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ### About the Scientific ML Benchmark Suite
            
            This comprehensive evaluation framework combines two powerful assessment methodologies:

            **Full reference gist explaining the framework:** https://gist.github.com/DarkStarStrix/4a2f4f91b8148e35574fc696ab9715e3

            #### 🎯 Domain Benchmarks
            - **Proteins**: Secondary/tertiary structure prediction accuracy
            - **Astronomy**: Object classification and detection
            - **Materials**: Property prediction and discovery
            - **QST**: Quantum state tomography reconstruction
            - **HEP**: High energy physics event classification
            - **CFD**: Computational fluid dynamics modeling
            
            #### 🔬 SCIEVAL Framework
            SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
            
            - **Standardized Evaluation**: Reproducible metrics for scientific LLMs
            - **Domain Adaptation**: Field-specific evaluation extensions
            - **Research Utility**: Assessment of real-world scientific value
            
            **OSIR-Field Extensions** (the physics extension's scores are used in this app, as sketched below):
            - `osir-field-physics`: Physics-specific reasoning evaluation
            - `osir-field-bio`: Biological sciences assessment
            - `osir-field-chem`: Chemistry domain evaluation
            - `osir-field-cs`: Computer science applications
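
            In this app, the physics extension's scores live under the "OSIR-Field (Physics)" key of the SCIEVAL_METRICS dictionary. A minimal sketch of reading one score, using names taken from this file:

            ```python
            # Read one OSIR-Field (Physics) score from the in-app SCIEVAL_METRICS data
            physics = SCIEVAL_METRICS["Nexa Mistral Sci-7B"]["OSIR-Field (Physics)"]
            print(physics["Scientific Utility"])  # 8.3 on the 1-10 scale
            ```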
            
            #### 📈 Scoring System
            - **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better)
            - **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions
            
            #### 🤝 Contributing
            This is an open framework welcoming contributions:
            - New domain-specific test sets
            - Additional evaluation metrics
            - Model submissions for benchmarking (a minimal sketch of registering a score follows below)
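
            In this app a benchmark score is simply another entry in the MODEL_EVALS dictionary; a minimal, illustrative sketch (the "Materials" domain exists in this file, while the model name and score below are placeholders):

            ```python
            # Placeholder model name and score, for illustration only
            MODEL_EVALS["Materials"]["Your Model"] = 0.95  # 0.0-1.0 domain-benchmark scale
            ```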
            
            #### 📄 Citation
            ```
            @misc{scieval2025,
              title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
              author={NEXA Research},
              year={2025},
              url={https://huggingface.co/spaces/osir/scieval}
            }
            ```
            
            ---
            
            **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
            """)
    
    # Initialize with default values
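    # Both loaders run when the page opens so the default plots appear without a button click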
    demo.load(
        lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
        outputs=[domain_plot, domain_metrics]
    )
    
    demo.load(
        lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"), 
                json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
        outputs=[scieval_plot, scieval_metrics]
    )

if __name__ == "__main__":
    demo.launch()