Allanatrix committed
Commit 0bbd367 · verified · 1 Parent(s): e5b3e38

Create app.py

Files changed (1)
app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
+ import gradio as gr
+ import plotly.graph_objects as go
+ import os
+ 
+ # ============ Leaderboard Data ============
+ MODEL_EVALS = {
+     "LLM (General OSIR)": {
+         "Nexa Mistral Sci-7B": 0.61,
+         "Llama-3-8B-Instruct": 0.39,
+         "Mixtral-8x7B-Instruct-v0.1": 0.41,
+         "Claude-3-Sonnet": 0.64,
+         "GPT-4-Turbo": 0.68,
+         "GPT-4o": 0.71,
+     },
+     "LLM (Field-Specific OSIR)": {
+         "Nexa Bio Adapter": 0.66,
+         "Nexa Astro Adapter": 0.70,
+         "GPT-4o (Biomed)": 0.69,
+         "Claude-3-Opus (Bio)": 0.67,
+         "Llama-3-8B-Bio": 0.42,
+         "Mixtral-8x7B-BioTune": 0.43,
+     },
+ }
+ 
+ # ============ Plotting Function ============
+ def plot_domain(domain):
+     sorted_items = sorted(MODEL_EVALS[domain].items(), key=lambda x: x[1], reverse=True)
+     models, scores = zip(*sorted_items)
+ 
+     fig = go.Figure()
+     fig.add_trace(go.Bar(
+         x=scores,
+         y=models,
+         orientation='h',
+         marker_color='lightblue',
+     ))
+ 
+     fig.update_layout(
+         title=f"Model vs. Overall Score — {domain}",
+         xaxis_title="Scientific Utility Score",
+         yaxis_title="Model",
+         xaxis_range=[0, 1.0],
+         template="plotly_white",
+         height=500,
+         margin=dict(l=120, r=20, t=40, b=40),
+     )
+     return fig
+ 
+ # ============ Upload Handling (for later use) ============
+ def handle_upload(file):
+     if file is not None:
+         return f"Uploaded: {file.name}"
+     return "No file uploaded."
+ 
+ # ============ Gradio UI ============
+ with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
+     gr.Markdown("""
+ # 🧠 SciEval | OSIR Leaderboard
+ Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance across the **SciEval** benchmark.
+ """)
+ 
+     with gr.Row():
+         with gr.Column():
+             domain_choice = gr.Dropdown(choices=list(MODEL_EVALS.keys()), label="Select Evaluation Domain", value="LLM (General OSIR)")
+             leaderboard_plot = gr.Plot()
+             domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)
+ 
+         with gr.Column():
+             gr.Markdown("""
+ ### 📄 Upload Model Output
+ Upload a generated scientific paper or abstract (PDF or TXT).
+ """)
+             upload = gr.File(file_types=[".pdf", ".txt"])
+             upload_btn = gr.Button("Submit File")
+             result = gr.Textbox(label="Upload Status")
+             upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)
+ 
+     gr.Markdown("""
+ ---
+ ### ℹ️ About
+ **SciEval** is a model-agnostic benchmark to evaluate the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. We score models based on:
+ 
+ - Information entropy & novelty
+ - Internal consistency
+ - Hypothesis framing
+ - Domain grounding & math logic
+ - Scientific utility (overall use to researchers)
+ 
+ This leaderboard includes Nexa's adapters and comparisons to general-purpose LLMs like GPT-4o, Claude 3, and open-source Mistral / LLaMA.
+ """)
+ 
+ 
+     demo.load(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)  # draw the default domain's chart on page load (the Plot component is already rendered in the layout above)
+ 
+ demo.launch()
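Note on running this locally: the commit adds only app.py with no requirements file, so the two imported libraries presumably have to be installed first (pip install gradio plotly); python app.py then reaches demo.launch(), which serves the leaderboard at the local URL Gradio prints to the console.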