Allanatrix committed
Commit b0ad3dc · verified · 1 Parent(s): 0bbd367

Update app.py

Files changed (1)
  1. app.py +92 -42
app.py CHANGED
@@ -1,9 +1,40 @@
 import gradio as gr
 import plotly.graph_objects as go
-import os
+import json
 
-# ============ Leaderboard Data ============
-MODEL_EVALS = {
+# Data for tabular models
+TABULAR_MODEL_EVALS = {
+    "Proteins": {
+        "Nexa Bio1 (Secondary)": 0.71,
+        "Porter6 (Secondary)": 0.8456,
+        "DeepCNF (Secondary)": 0.85,
+        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
+        "Nexa Bio2 (Tertiary)": 0.90,
+    },
+    "Astro": {
+        "Nexa Astro": 0.97,
+        "Baseline CNN": 0.89,
+    },
+    "Materials": {
+        "Nexa Materials": 0.9999,
+        "Random Forest Baseline": 0.92,
+    },
+    "QST": {
+        "Nexa PIN Model": 0.80,
+        "Quantum TomoNet": 0.85,
+    },
+    "HEP": {
+        "Nexa HEP Model": 0.91,
+        "CMSNet": 0.94,
+    },
+    "CFD": {
+        "Nexa CFD Model": 0.92,
+        "FlowNet": 0.89,
+    },
+}
+
+# Data for LLMs
+LLM_MODEL_EVALS = {
     "LLM (General OSIR)": {
         "Nexa Mistral Sci-7B": 0.61,
         "Llama-3-8B-Instruct": 0.39,
@@ -22,9 +53,9 @@ MODEL_EVALS = {
     },
 }
 
-# ============ Plotting Function ============
-def plot_domain(domain):
-    sorted_items = sorted(MODEL_EVALS[domain].items(), key=lambda x: x[1], reverse=True)
+# Universal plotting function for horizontal bar charts
+def plot_horizontal_bar(domain, data, color):
+    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
     models, scores = zip(*sorted_items)
 
     fig = go.Figure()
@@ -32,12 +63,12 @@ def plot_domain(domain):
         x=scores,
         y=models,
         orientation='h',
-        marker_color='lightblue',
+        marker_color=color,
     ))
 
     fig.update_layout(
-        title=f"Model vs. Overall Score — {domain}",
-        xaxis_title="Scientific Utility Score",
+        title=f"Model Benchmark Scores — {domain}",
+        xaxis_title="Score",
         yaxis_title="Model",
         xaxis_range=[0, 1.0],
         template="plotly_white",
@@ -46,49 +77,68 @@ def plot_domain(domain):
     )
     return fig
 
-# ============ Upload Handling (for later use) ============
-def handle_upload(file):
-    if file is not None:
-        return f"Uploaded: {file.name}"
-    return "No file uploaded."
+# Display functions for each section
+def display_tabular_eval(domain):
+    if domain not in TABULAR_MODEL_EVALS:
+        return None, "Invalid domain selected"
+    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], 'indigo')
+    details = json.dumps(TABULAR_MODEL_EVALS[domain], indent=2)
+    return plot, details
+
+def display_llm_eval(domain):
+    if domain not in LLM_MODEL_EVALS:
+        return None, "Invalid domain selected"
+    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], 'lightblue')
+    details = json.dumps(LLM_MODEL_EVALS[domain], indent=2)
+    return plot, details
 
-# ============ Gradio UI ============
+# Gradio interface
 with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
     gr.Markdown("""
-    # 🧠 SciEval | OSIR Leaderboard
-    Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance across the **SciEval** benchmark.
+    # 🔬 Nexa Evals Scientific ML Benchmark Suite
+    A comprehensive benchmarking suite comparing Nexa models against state-of-the-art models across scientific domains and language models.
     """)
 
-    with gr.Row():
-        with gr.Column():
-            domain_choice = gr.Dropdown(choices=list(MODEL_EVALS.keys()), label="Select Evaluation Domain", value="LLM (General OSIR)")
-            leaderboard_plot = gr.Plot()
-            domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)
+    with gr.Tabs():
+        with gr.TabItem("Tabular Models"):
+            with gr.Row():
+                tabular_domain = gr.Dropdown(
+                    choices=list(TABULAR_MODEL_EVALS.keys()),
+                    label="Select Domain",
+                    value="Proteins"
+                )
+                show_tabular_btn = gr.Button("Show Evaluation")
+            tabular_plot = gr.Plot(label="Benchmark Plot")
+            tabular_details = gr.Code(label="Raw Scores (JSON)", language="json")
+            show_tabular_btn.click(
+                fn=display_tabular_eval,
+                inputs=tabular_domain,
+                outputs=[tabular_plot, tabular_details]
+            )
 
-        with gr.Column():
-            gr.Markdown("""
-            ### 📄 Upload Model Output
-            Upload a generated scientific paper or abstract (PDF or TXT).
-            """)
-            upload = gr.File(file_types=[".pdf", ".txt"])
-            upload_btn = gr.Button("Submit File")
-            result = gr.Textbox(label="Upload Status")
-            upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)
+        with gr.TabItem("LLMs"):
+            with gr.Row():
+                llm_domain = gr.Dropdown(
+                    choices=list(LLM_MODEL_EVALS.keys()),
+                    label="Select Domain",
+                    value="LLM (General OSIR)"
+                )
+                show_llm_btn = gr.Button("Show Evaluation")
+            llm_plot = gr.Plot(label="Benchmark Plot")
+            llm_details = gr.Code(label="Raw Scores (JSON)", language="json")
+            show_llm_btn.click(
+                fn=display_llm_eval,
+                inputs=llm_domain,
+                outputs=[llm_plot, llm_details]
+            )
 
     gr.Markdown("""
     ---
     ### ℹ️ About
-    **SciEval** is a model-agnostic benchmark to evaluate the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. We score models based on:
-
-    - Information entropy & novelty
-    - Internal consistency
-    - Hypothesis framing
-    - Domain grounding & math logic
-    - Scientific utility (overall use to researchers)
-
-    This leaderboard includes Nexa's adapters and comparisons to general-purpose LLMs like GPT-4o, Claude 3, and open-source Mistral / LLaMA.
+    Nexa Evals provides benchmarks for both tabular models and language models in scientific domains:
+    - **Tabular Models**: Evaluated on domain-specific metrics (e.g., accuracy, GDT-TS) across fields like Proteins, Astro, Materials, QST, HEP, and CFD.
+    - **Language Models**: Assessed using the SciEval benchmark under the OSIR initiative, focusing on scientific utility, information entropy, internal consistency, hypothesis framing, domain grounding, and math logic.
+    Scores range from 0 to 1, with higher values indicating better performance. Models are sorted by score in descending order for easy comparison.
     """)
 
-    leaderboard_plot.render()
-
     demo.launch()
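
To sanity-check the new plotting path without launching the Gradio app, the following minimal sketch re-creates the updated plot_horizontal_bar helper from this commit and feeds it a small sample of the "Proteins" domain; the sample dict and the output filename are illustrative only, not part of the commit.

# Minimal sketch (not part of the commit): exercise the new helper directly.
import json
import plotly.graph_objects as go

def plot_horizontal_bar(domain, data, color):
    # Same logic as the updated app.py: sort scores descending, draw a horizontal bar chart.
    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
    models, scores = zip(*sorted_items)
    fig = go.Figure()
    fig.add_trace(go.Bar(x=scores, y=models, orientation='h', marker_color=color))
    fig.update_layout(
        title=f"Model Benchmark Scores — {domain}",
        xaxis_title="Score",
        yaxis_title="Model",
        xaxis_range=[0, 1.0],
        template="plotly_white",
    )
    return fig

# Illustrative subset of the "Proteins" entries from TABULAR_MODEL_EVALS.
proteins = {"Nexa Bio1 (Secondary)": 0.71, "AlphaFold2 (Tertiary GDT-TS)": 0.924}
fig = plot_horizontal_bar("Proteins", proteins, "indigo")
print(json.dumps(proteins, indent=2))       # same JSON string the gr.Code box receives
fig.write_html("proteins_benchmark.html")   # hypothetical output file; open in a browser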