import gradio as gr
import plotly.graph_objs as go
import pandas as pd
import json

# Domain-specific model evaluations
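# Scores are on a 0.0-1.0 scale (higher is better); each domain pairs Nexa models with reference/baseline models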
MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
        "Porter6 (Secondary)": 0.8456,
        "DeepCNF (Secondary)": 0.85,
        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
        "Nexa Bio2 (Tertiary)": 0.90,
    },
    "Astro": {
        "Nexa Astro": 0.97,
        "Baseline CNN": 0.89,
    },
    "Materials": {
        "Nexa Materials": 0.9999,
        "Random Forest Baseline": 0.92,
    },
    "QST": {
        "Nexa PIN Model": 0.80,
        "Quantum TomoNet": 0.85,
    },
    "HEP": {
        "Nexa HEP Model": 0.91,
        "CMSNet": 0.94,
    },
    "CFD": {
        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
}

# SCIEVAL/OSIR metrics data
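# Each OSIR metric is scored on a 1-10 scale across seven reasoning dimensions (listed in the SCIEVAL tab)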
SCIEVAL_METRICS = {
    "Nexa Mistral Sci-7B": {
        "OSIR (General)": {
            "Entropy / Novelty": 6.2,
            "Internal Consistency": 8.5,
            "Hypothesis Framing": 6.8,
            "Thematic Grounding": 7.9,
            "Citation & Structure": 7.3,
            "Symbolism & Math Logic": 6.1,
            "Scientific Utility": 7.6
        },
        "OSIR-Field (Physics)": {
            "Entropy / Novelty": 7.1,
            "Internal Consistency": 8.9,
            "Hypothesis Framing": 7.4,
            "Thematic Grounding": 8.2,
            "Citation & Structure": 6.5,
            "Symbolism & Math Logic": 7.8,
            "Scientific Utility": 8.3
        }
    }
}

def plot_domain_benchmark(domain):
    """Create horizontal bar chart for domain-specific benchmarks"""
    models = list(MODEL_EVALS[domain].keys())
    scores = list(MODEL_EVALS[domain].values())
    
    # Color coding for Nexa models vs others
    colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]
    
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=models, 
        x=scores, 
        orientation='h',
        marker_color=colors,
        text=[f'{score:.3f}' for score in scores],
        textposition='auto'
    ))
    
    fig.update_layout(
        title=f"Model Benchmark Scores β€” {domain}",
        yaxis_title="Model",
        xaxis_title="Score",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        showlegend=False
    )
    return fig

def plot_scieval_comparison(model_name):
    """Create horizontal comparison chart for SCIEVAL metrics"""
    if model_name not in SCIEVAL_METRICS:
        return go.Figure()
    
    metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
    osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
    field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())
    
    fig = go.Figure()
    
    fig.add_trace(go.Bar(
        name='OSIR (General)',
        y=metrics,
        x=osir_scores,
        orientation='h',
        marker_color='#FFD700',
        text=[f'{score:.1f}' for score in osir_scores],
        textposition='auto'
    ))
    
    fig.add_trace(go.Bar(
        name='OSIR-Field (Physics)',
        y=metrics,
        x=field_scores,
        orientation='h',
        marker_color='#FF6B35',
        text=[f'{score:.1f}' for score in field_scores],
        textposition='auto'
    ))
    
    fig.update_layout(
        title=f"SCIEVAL Metrics Comparison β€” {model_name}",
        yaxis_title="Metric",
        xaxis_title="Score (1-10)",
        xaxis_range=[0, 10],
        template="plotly_white",
        height=500,
        barmode='group'
    )
    return fig

def create_leaderboard():
    """Create leaderboard table"""
    leaderboard_data = []
    
    # Add domain benchmark leaders
    for domain, models in MODEL_EVALS.items():
        best_model = max(models.items(), key=lambda x: x[1])
        leaderboard_data.append({
            "Domain": domain,
            "Best Model": best_model[0],
            "Score": f"{best_model[1]:.3f}",
            "Metric Type": "Domain Benchmark"
        })
    
    # Add SCIEVAL leaders
    for model, evaluations in SCIEVAL_METRICS.items():
        avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
        avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
        
        leaderboard_data.append({
            "Domain": "OSIR General",
            "Best Model": model,
            "Score": f"{avg_osir:.2f}",
            "Metric Type": "SCIEVAL"
        })
        
        leaderboard_data.append({
            "Domain": "OSIR Physics",
            "Best Model": model,
            "Score": f"{avg_field:.2f}",
            "Metric Type": "SCIEVAL"
        })
    
    df = pd.DataFrame(leaderboard_data)
    return df

def get_model_details(domain):
    """Get JSON details for domain models"""
    return json.dumps(MODEL_EVALS[domain], indent=2)

def display_domain_eval(domain):
    """Display domain evaluation results"""
    plot = plot_domain_benchmark(domain)
    details = get_model_details(domain)
    return plot, details

def display_scieval(model_name):
    """Display SCIEVAL results"""
    plot = plot_scieval_comparison(model_name)
    if model_name in SCIEVAL_METRICS:
        details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
    else:
        details = "Model not found in SCIEVAL database"
    return plot, details

# Create Gradio interface
with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔬 Scientific ML Benchmark Suite
    ### Comprehensive evaluation framework for scientific machine learning models
    
    This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
    a unified assessment of ML models across scientific disciplines.
    """)
    
    with gr.Tabs():
        # Domain Benchmarks Tab
        with gr.TabItem("🧪 Domain Benchmarks"):
            gr.Markdown("""
            ### Domain-Specific Model Evaluations
            Compare models across scientific domains including Proteins, Astronomy, Materials Science, 
            Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
            """)
            
            with gr.Row():
                domain_dropdown = gr.Dropdown(
                    choices=list(MODEL_EVALS.keys()), 
                    label="Select Scientific Domain",
                    value="Proteins"
                )
                domain_btn = gr.Button("Run Domain Evaluation", variant="primary")
            
            with gr.Row():
                domain_plot = gr.Plot(label="Domain Benchmark Results")
                domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")
            
            domain_btn.click(
                display_domain_eval, 
                inputs=domain_dropdown, 
                outputs=[domain_plot, domain_metrics]
            )
        
        # SCIEVAL Tab
        with gr.TabItem("📊 SCIEVAL Metrics"):
            gr.Markdown("""
            ### SCIEVAL: Scientific Reasoning Evaluation
            Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.
            
            **Metrics evaluated:**
            - **Entropy/Novelty**: Originality and information richness
            - **Internal Consistency**: Logical structure and argument continuity  
            - **Hypothesis Framing**: Research aim clarity
            - **Thematic Grounding**: Domain focus and relevance
            - **Citation & Structure**: Scientific formatting
            - **Symbolism & Math Logic**: Mathematical rigor
            - **Scientific Utility**: Real-world research value
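
            The Leaderboard tab reports the mean of these seven scores for each evaluation; for example, the OSIR (General) average for Nexa Mistral Sci-7B is (6.2 + 8.5 + 6.8 + 7.9 + 7.3 + 6.1 + 7.6) / 7 = 7.2.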
            """)
            
            with gr.Row():
                scieval_dropdown = gr.Dropdown(
                    choices=list(SCIEVAL_METRICS.keys()),
                    label="Select Model for SCIEVAL",
                    value="Nexa Mistral Sci-7B"
                )
                scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")
            
            with gr.Row():
                scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
                scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")
            
            scieval_btn.click(
                display_scieval,
                inputs=scieval_dropdown,
                outputs=[scieval_plot, scieval_metrics]
            )
        
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("""
            ### Scientific ML Model Leaderboard
            Current best-performing models across all evaluated domains and metrics.
            """)
            
            leaderboard_df = create_leaderboard()
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                label="Current Leaders by Domain",
                interactive=False
            )
        
        # About Tab
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ### About the Scientific ML Benchmark Suite
            
            This comprehensive evaluation framework combines two powerful assessment methodologies:

            **Full reference gist explaining the framework:** https://gist.github.com/DarkStarStrix/4a2f4f91b8148e35574fc696ab9715e3

            #### 🎯 Domain Benchmarks
            - **Proteins**: Secondary/tertiary structure prediction accuracy
            - **Astronomy**: Object classification and detection
            - **Materials**: Property prediction and discovery
            - **QST**: Quantum state tomography reconstruction
            - **HEP**: High energy physics event classification
            - **CFD**: Computational fluid dynamics modeling
            
            #### 🔬 SCIEVAL Framework
            SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
            
            - **Standardized Evaluation**: Reproducible metrics for scientific LLMs
            - **Domain Adaptation**: Field-specific evaluation extensions
            - **Research Utility**: Assessment of real-world scientific value
            
            **OSIR-Field Extensions** (the physics extension's scores are used in this app, as sketched below):
            - `osir-field-physics`: Physics-specific reasoning evaluation
            - `osir-field-bio`: Biological sciences assessment
            - `osir-field-chem`: Chemistry domain evaluation
            - `osir-field-cs`: Computer science applications
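
            In this app, the physics extension's scores live under the "OSIR-Field (Physics)" key of the SCIEVAL_METRICS dictionary. A minimal sketch of reading one score, using names taken from this file:

            ```python
            # Read one OSIR-Field (Physics) score from the in-app SCIEVAL_METRICS data
            physics = SCIEVAL_METRICS["Nexa Mistral Sci-7B"]["OSIR-Field (Physics)"]
            print(physics["Scientific Utility"])  # 8.3 on the 1-10 scale
            ```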
            
            #### 📈 Scoring System
            - **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better)
            - **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions
            
            #### 🤝 Contributing
            This is an open framework welcoming contributions:
            - New domain-specific test sets
            - Additional evaluation metrics
            - Model submissions for benchmarking (a minimal sketch of registering a score follows below)
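
            In this app a benchmark score is simply another entry in the MODEL_EVALS dictionary; a minimal, illustrative sketch (the "Materials" domain exists in this file, while the model name and score below are placeholders):

            ```python
            # Placeholder model name and score, for illustration only
            MODEL_EVALS["Materials"]["Your Model"] = 0.95  # 0.0-1.0 domain-benchmark scale
            ```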
            
            #### 📄 Citation
            ```
            @misc{scieval2025,
              title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
              author={NEXA Research},
              year={2025},
              url={https://huggingface.co/spaces/osir/scieval}
            }
            ```
            
            ---
            
            **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
            """)
    
    # Initialize with default values
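    # Both loaders run when the page opens so the default plots appear without a button click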
    demo.load(
        lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
        outputs=[domain_plot, domain_metrics]
    )
    
    demo.load(
        lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"), 
                json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
        outputs=[scieval_plot, scieval_metrics]
    )

if __name__ == "__main__":
    demo.launch()