Create app.py
app.py
ADDED
@@ -0,0 +1,94 @@
import gradio as gr
import plotly.graph_objects as go

# ============ Leaderboard Data ============
# Overall Scientific Utility Scores (0-1 scale) per model, grouped by
# evaluation domain.
MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 0.61,
        "Llama-3-8B-Instruct": 0.39,
        "Mixtral-8x7B-Instruct-v0.1": 0.41,
        "Claude-3-Sonnet": 0.64,
        "GPT-4-Turbo": 0.68,
        "GPT-4o": 0.71,
    },
    "LLM (Field-Specific OSIR)": {
        "Nexa Bio Adapter": 0.66,
        "Nexa Astro Adapter": 0.70,
        "GPT-4o (Biomed)": 0.69,
        "Claude-3-Opus (Bio)": 0.67,
        "Llama-3-8B-Bio": 0.42,
        "Mixtral-8x7B-BioTune": 0.43,
    },
}

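# The dropdown below derives its choices from MODEL_EVALS keys, so a new
# domain can be added here with no UI changes. A hypothetical example
# (placeholder entry, not real benchmark data):
#
#     MODEL_EVALS["LLM (Materials OSIR)"] = {
#         "Nexa Materials Adapter": 0.00,  # placeholder score
#     }
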
# ============ Plotting Function ============
def plot_domain(domain):
    """Return a horizontal bar chart of model scores for one domain, best first."""
    sorted_items = sorted(MODEL_EVALS[domain].items(), key=lambda x: x[1], reverse=True)
    models, scores = zip(*sorted_items)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color='lightblue',
    ))

    fig.update_layout(
        title=f"Model vs. Overall Score — {domain}",
        xaxis_title="Scientific Utility Score",
        yaxis_title="Model",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        margin=dict(l=120, r=20, t=40, b=40),
    )
    return fig

# ============ Upload Handling (for later use) ============
def handle_upload(file):
    if file is not None:
        return f"Uploaded: {file.name}"
    return "No file uploaded."

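# --- Hypothetical scoring hook (illustrative only) ---
# handle_upload is marked "for later use"; this sketch shows where a SciEval
# scorer could eventually plug in. The function name, criteria keys, and
# equal weighting are assumptions for illustration, not the OSIR pipeline.
def score_submission(text):
    criteria = {
        "novelty": 0.0,             # information entropy & novelty
        "consistency": 0.0,         # internal consistency
        "hypothesis_framing": 0.0,  # hypothesis framing
        "grounding": 0.0,           # domain grounding & math logic
    }
    # ...populate criteria from the submitted text, then aggregate into the
    # overall Scientific Utility Score on the leaderboard's 0-1 scale.
    return sum(criteria.values()) / len(criteria)
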
# ============ Gradio UI ============
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
    gr.Markdown("""
    # 🧠 SciEval | OSIR Leaderboard
    Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance on the **SciEval** benchmark.
    """)

    with gr.Row():
        with gr.Column():
            domain_choice = gr.Dropdown(
                choices=list(MODEL_EVALS.keys()),
                label="Select Evaluation Domain",
                value="LLM (General OSIR)",
            )
            # Seed the plot with the default domain so the chart is not empty
            # before the first dropdown interaction.
            leaderboard_plot = gr.Plot(value=plot_domain("LLM (General OSIR)"))
            domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)

        with gr.Column():
            gr.Markdown("""
            ### 📄 Upload Model Output
            Upload a generated scientific paper or abstract (PDF or TXT).
            """)
            upload = gr.File(file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Submit File")
            result = gr.Textbox(label="Upload Status")
            upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)

    gr.Markdown("""
    ---
    ### ℹ️ About
    **SciEval** is a model-agnostic benchmark for evaluating the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. We score models on:

    - Information entropy & novelty
    - Internal consistency
    - Hypothesis framing
    - Domain grounding & math logic
    - Scientific utility (overall use to researchers)

    This leaderboard includes Nexa's adapters alongside general-purpose LLMs such as GPT-4o, Claude 3, and the open-source Mistral / LLaMA families.
    """)

demo.launch()
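For the Space to build, a requirements.txt beside app.py should list the third-party imports used above. A minimal sketch (unpinned versions; on a Gradio-SDK Space, gradio itself is supplied by the sdk_version in the README, so only plotly strictly needs listing):

    plotly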