Allanatrix committed
Commit b0ad3dc · verified · 1 Parent(s): 0bbd367

Update app.py

Files changed (1)
  1. app.py +92 -42
app.py CHANGED
@@ -1,9 +1,40 @@
 import gradio as gr
 import plotly.graph_objects as go
-import os
+import json
 
-# ============ Leaderboard Data ============
-MODEL_EVALS = {
+# Data for tabular models
+TABULAR_MODEL_EVALS = {
+    "Proteins": {
+        "Nexa Bio1 (Secondary)": 0.71,
+        "Porter6 (Secondary)": 0.8456,
+        "DeepCNF (Secondary)": 0.85,
+        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
+        "Nexa Bio2 (Tertiary)": 0.90,
+    },
+    "Astro": {
+        "Nexa Astro": 0.97,
+        "Baseline CNN": 0.89,
+    },
+    "Materials": {
+        "Nexa Materials": 0.9999,
+        "Random Forest Baseline": 0.92,
+    },
+    "QST": {
+        "Nexa PIN Model": 0.80,
+        "Quantum TomoNet": 0.85,
+    },
+    "HEP": {
+        "Nexa HEP Model": 0.91,
+        "CMSNet": 0.94,
+    },
+    "CFD": {
+        "Nexa CFD Model": 0.92,
+        "FlowNet": 0.89,
+    },
+}
+
+# Data for LLMs
+LLM_MODEL_EVALS = {
     "LLM (General OSIR)": {
         "Nexa Mistral Sci-7B": 0.61,
         "Llama-3-8B-Instruct": 0.39,
@@ -22,9 +53,9 @@ MODEL_EVALS = {
     },
 }
 
-# ============ Plotting Function ============
-def plot_domain(domain):
-    sorted_items = sorted(MODEL_EVALS[domain].items(), key=lambda x: x[1], reverse=True)
+# Universal plotting function for horizontal bar charts
+def plot_horizontal_bar(domain, data, color):
+    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
     models, scores = zip(*sorted_items)
 
     fig = go.Figure()
@@ -32,12 +63,12 @@ def plot_domain(domain):
         x=scores,
         y=models,
         orientation='h',
-        marker_color='lightblue',
+        marker_color=color,
     ))
 
     fig.update_layout(
-        title=f"Model vs. Overall Score — {domain}",
-        xaxis_title="Scientific Utility Score",
+        title=f"Model Benchmark Scores — {domain}",
+        xaxis_title="Score",
         yaxis_title="Model",
         xaxis_range=[0, 1.0],
         template="plotly_white",
@@ -46,49 +77,68 @@ def plot_domain(domain):
     )
     return fig
 
-# ============ Upload Handling (for later use) ============
-def handle_upload(file):
-    if file is not None:
-        return f"Uploaded: {file.name}"
-    return "No file uploaded."
+# Display functions for each section
+def display_tabular_eval(domain):
+    if domain not in TABULAR_MODEL_EVALS:
+        return None, "Invalid domain selected"
+    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], 'indigo')
+    details = json.dumps(TABULAR_MODEL_EVALS[domain], indent=2)
+    return plot, details
+
+def display_llm_eval(domain):
+    if domain not in LLM_MODEL_EVALS:
+        return None, "Invalid domain selected"
+    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], 'lightblue')
+    details = json.dumps(LLM_MODEL_EVALS[domain], indent=2)
+    return plot, details
 
-# ============ Gradio UI ============
+# Gradio interface
 with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
     gr.Markdown("""
-    # 🧠 SciEval | OSIR Leaderboard
-    Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance across the **SciEval** benchmark.
+    # 🔬 Nexa Evals Scientific ML Benchmark Suite
+    A comprehensive benchmarking suite comparing Nexa models against state-of-the-art models across scientific domains and language models.
     """)
 
-    with gr.Row():
-        with gr.Column():
-            domain_choice = gr.Dropdown(choices=list(MODEL_EVALS.keys()), label="Select Evaluation Domain", value="LLM (General OSIR)")
-            leaderboard_plot = gr.Plot()
-            domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)
+    with gr.Tabs():
+        with gr.TabItem("Tabular Models"):
+            with gr.Row():
+                tabular_domain = gr.Dropdown(
+                    choices=list(TABULAR_MODEL_EVALS.keys()),
+                    label="Select Domain",
+                    value="Proteins"
+                )
+                show_tabular_btn = gr.Button("Show Evaluation")
+            tabular_plot = gr.Plot(label="Benchmark Plot")
+            tabular_details = gr.Code(label="Raw Scores (JSON)", language="json")
+            show_tabular_btn.click(
+                fn=display_tabular_eval,
+                inputs=tabular_domain,
+                outputs=[tabular_plot, tabular_details]
+            )
 
-        with gr.Column():
-            gr.Markdown("""
-            ### 📄 Upload Model Output
-            Upload a generated scientific paper or abstract (PDF or TXT).
-            """)
-            upload = gr.File(file_types=[".pdf", ".txt"])
-            upload_btn = gr.Button("Submit File")
-            result = gr.Textbox(label="Upload Status")
-            upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)
+        with gr.TabItem("LLMs"):
+            with gr.Row():
+                llm_domain = gr.Dropdown(
+                    choices=list(LLM_MODEL_EVALS.keys()),
+                    label="Select Domain",
+                    value="LLM (General OSIR)"
+                )
+                show_llm_btn = gr.Button("Show Evaluation")
+            llm_plot = gr.Plot(label="Benchmark Plot")
+            llm_details = gr.Code(label="Raw Scores (JSON)", language="json")
+            show_llm_btn.click(
+                fn=display_llm_eval,
+                inputs=llm_domain,
+                outputs=[llm_plot, llm_details]
+            )
 
     gr.Markdown("""
     ---
     ### ℹ️ About
-    **SciEval** is a model-agnostic benchmark to evaluate the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. We score models based on:
-
-    - Information entropy & novelty
-    - Internal consistency
-    - Hypothesis framing
-    - Domain grounding & math logic
-    - Scientific utility (overall use to researchers)
-
-    This leaderboard includes Nexa's adapters and comparisons to general-purpose LLMs like GPT-4o, Claude 3, and open-source Mistral / LLaMA.
+    Nexa Evals provides benchmarks for both tabular models and language models in scientific domains:
+    - **Tabular Models**: Evaluated on domain-specific metrics (e.g., accuracy, GDT-TS) across fields like Proteins, Astro, Materials, QST, HEP, and CFD.
+    - **Language Models**: Assessed using the SciEval benchmark under the OSIR initiative, focusing on scientific utility, information entropy, internal consistency, hypothesis framing, domain grounding, and math logic.
+    Scores range from 0 to 1, with higher values indicating better performance. Models are sorted by score in descending order for easy comparison.
     """)
 
-    leaderboard_plot.render()
-
     demo.launch()
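
To sanity-check the new plotting path without launching the Gradio app, the following minimal sketch re-creates the updated plot_horizontal_bar helper from this commit and feeds it a small sample of the "Proteins" domain; the sample dict and the output filename are illustrative only, not part of the commit.

# Minimal sketch (not part of the commit): exercise the new helper directly.
import json
import plotly.graph_objects as go

def plot_horizontal_bar(domain, data, color):
    # Same logic as the updated app.py: sort scores descending, draw a horizontal bar chart.
    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
    models, scores = zip(*sorted_items)
    fig = go.Figure()
    fig.add_trace(go.Bar(x=scores, y=models, orientation='h', marker_color=color))
    fig.update_layout(
        title=f"Model Benchmark Scores — {domain}",
        xaxis_title="Score",
        yaxis_title="Model",
        xaxis_range=[0, 1.0],
        template="plotly_white",
    )
    return fig

# Illustrative subset of the "Proteins" entries from TABULAR_MODEL_EVALS.
proteins = {"Nexa Bio1 (Secondary)": 0.71, "AlphaFold2 (Tertiary GDT-TS)": 0.924}
fig = plot_horizontal_bar("Proteins", proteins, "indigo")
print(json.dumps(proteins, indent=2))       # same JSON string the gr.Code box receives
fig.write_html("proteins_benchmark.html")   # hypothetical output file; open in a browser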