Allanatrix committed
Commit 0bbd367 · verified · 1 Parent(s): e5b3e38

Create app.py

Files changed (1)
app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
+ import gradio as gr
+ import plotly.graph_objects as go
+ import os
+ 
+ # ============ Leaderboard Data ============
+ MODEL_EVALS = {
+     "LLM (General OSIR)": {
+         "Nexa Mistral Sci-7B": 0.61,
+         "Llama-3-8B-Instruct": 0.39,
+         "Mixtral-8x7B-Instruct-v0.1": 0.41,
+         "Claude-3-Sonnet": 0.64,
+         "GPT-4-Turbo": 0.68,
+         "GPT-4o": 0.71,
+     },
+     "LLM (Field-Specific OSIR)": {
+         "Nexa Bio Adapter": 0.66,
+         "Nexa Astro Adapter": 0.70,
+         "GPT-4o (Biomed)": 0.69,
+         "Claude-3-Opus (Bio)": 0.67,
+         "Llama-3-8B-Bio": 0.42,
+         "Mixtral-8x7B-BioTune": 0.43,
+     },
+ }
+ 
+ # ============ Plotting Function ============
+ def plot_domain(domain):
+     sorted_items = sorted(MODEL_EVALS[domain].items(), key=lambda x: x[1], reverse=True)
+     models, scores = zip(*sorted_items)
+ 
+     fig = go.Figure()
+     fig.add_trace(go.Bar(
+         x=scores,
+         y=models,
+         orientation='h',
+         marker_color='lightblue',
+     ))
+ 
+     fig.update_layout(
+         title=f"Model vs. Overall Score — {domain}",
+         xaxis_title="Scientific Utility Score",
+         yaxis_title="Model",
+         xaxis_range=[0, 1.0],
+         template="plotly_white",
+         height=500,
+         margin=dict(l=120, r=20, t=40, b=40),
+     )
+     return fig
+ 
+ # ============ Upload Handling (for later use) ============
+ def handle_upload(file):
+     if file is not None:
+         return f"Uploaded: {file.name}"
+     return "No file uploaded."
+ 
+ # ============ Gradio UI ============
+ with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
+     gr.Markdown("""
+ # 🧠 SciEval | OSIR Leaderboard
+ Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance across the **SciEval** benchmark.
+ """)
+ 
+     with gr.Row():
+         with gr.Column():
+             domain_choice = gr.Dropdown(choices=list(MODEL_EVALS.keys()), label="Select Evaluation Domain", value="LLM (General OSIR)")
+             leaderboard_plot = gr.Plot()
+             domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)
+ 
+         with gr.Column():
+             gr.Markdown("""
+ ### 📄 Upload Model Output
+ Upload a generated scientific paper or abstract (PDF or TXT).
+ """)
+             upload = gr.File(file_types=[".pdf", ".txt"])
+             upload_btn = gr.Button("Submit File")
+             result = gr.Textbox(label="Upload Status")
+             upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)
+ 
+     gr.Markdown("""
+ ---
+ ### ℹ️ About
+ **SciEval** is a model-agnostic benchmark to evaluate the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. We score models based on:
+ 
+ - Information entropy & novelty
+ - Internal consistency
+ - Hypothesis framing
+ - Domain grounding & math logic
+ - Scientific utility (overall use to researchers)
+ 
+ This leaderboard includes Nexa's adapters and comparisons to general-purpose LLMs like GPT-4o, Claude 3, and open-source Mistral / LLaMA.
+ """)
+ 
+ 
+     demo.load(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)  # draw the default domain's chart on page load (the Plot component is already rendered in the layout above)
+ 
+ demo.launch()
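Note on running this locally: the commit adds only app.py with no requirements file, so the two imported libraries presumably have to be installed first (pip install gradio plotly); python app.py then reaches demo.launch(), which serves the leaderboard at the local URL Gradio prints to the console.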