Spaces:

Allanatrix
/

NexaEvals

Running

App Files Files Community

Allanatrix committed 19 days ago

Commit

94c2f22

verified ·

1 Parent(s): b0ad3dc

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -12

app.py CHANGED Viewed

@@ -53,17 +53,31 @@ LLM_MODEL_EVALS = {
     },
 }
-# Universal plotting function for horizontal bar charts
-def plot_horizontal_bar(domain, data, color):
     sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
     models, scores = zip(*sorted_items)
     fig = go.Figure()
     fig.add_trace(go.Bar(
         x=scores,
         y=models,
         orientation='h',
-        marker_color=color,
     ))
     fig.update_layout(
@@ -74,6 +88,36 @@ def plot_horizontal_bar(domain, data, color):
         template="plotly_white",
         height=500,
         margin=dict(l=120, r=20, t=40, b=40),
     )
     return fig
@@ -81,22 +125,27 @@ def plot_horizontal_bar(domain, data, color):
 def display_tabular_eval(domain):
     if domain not in TABULAR_MODEL_EVALS:
         return None, "Invalid domain selected"
-    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], 'indigo')
     details = json.dumps(TABULAR_MODEL_EVALS[domain], indent=2)
     return plot, details
 def display_llm_eval(domain):
     if domain not in LLM_MODEL_EVALS:
         return None, "Invalid domain selected"
-    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], 'lightblue')
     details = json.dumps(LLM_MODEL_EVALS[domain], indent=2)
     return plot, details
-# Gradio interface
-with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
     gr.Markdown("""
     # 🔬 Nexa Evals — Scientific ML Benchmark Suite
-    A comprehensive benchmarking suite comparing Nexa models against state-of-the-art models across scientific domains and language models.
     """)
     with gr.Tabs():
@@ -132,13 +181,30 @@ with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #f
                 outputs=[llm_plot, llm_details]
             )
     gr.Markdown("""
     ---
     ### ℹ️ About
-    Nexa Evals provides benchmarks for both tabular models and language models in scientific domains:
-    - **Tabular Models**: Evaluated on domain-specific metrics (e.g., accuracy, GDT-TS) across fields like Proteins, Astro, Materials, QST, HEP, and CFD.
-    - **Language Models**: Assessed using the SciEval benchmark under the OSIR initiative, focusing on scientific utility, information entropy, internal consistency, hypothesis framing, domain grounding, and math logic.
-    Scores range from 0 to 1, with higher values indicating better performance. Models are sorted by score in descending order for easy comparison.
     """)
 demo.launch()

     },
 }
+# Data for Nexa Mistral Sci-7B Evaluation (based on the provided image)
+# Layout: model name -> metric name -> {benchmark variant: score}.
+# plot_mistral_eval draws these scores on a fixed 0-10 axis, and the metric
+# names are used as the dropdown choices in the "Nexa Mistral Sci-7B" tab.
+NEXA_MISTRAL_EVALS = {
+    "Nexa Mistral Sci-7B": {
+        "Scientific Utility": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.5},
+        "Symbolism & Math Logic": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.5},
+        "Citation & Structure": {"OSIR (General)": 5.5, "OSIR-Field (Physics)": 6.0},
+        "Thematic Grounding": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.0},
+        "Hypothesis Framing": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.0},
+        "Internal Consistency": {"OSIR (General)": 9.0, "OSIR-Field (Physics)": 9.5},
+        "Entropy / Novelty": {"OSIR (General)": 6.5, "OSIR-Field (Physics)": 6.0},
+    }
+}
+# Universal plotting function with highlighted Nexa models
+def plot_horizontal_bar(domain, data, highlight_keyword="Nexa", highlight_color='indigo', default_color='lightgray'):
     sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
     models, scores = zip(*sorted_items)
+    colors = [highlight_color if highlight_keyword in model else default_color for model in models]
     fig = go.Figure()
     fig.add_trace(go.Bar(
         x=scores,
         y=models,
         orientation='h',
+        marker_color=colors,
     ))
     fig.update_layout(
         template="plotly_white",
         height=500,
         margin=dict(l=120, r=20, t=40, b=40),
+        yaxis=dict(automargin=True),
+    )
+    return fig
+# Plotting function for Nexa Mistral Sci-7B Evaluation
+def plot_mistral_eval(metric):
+    if metric not in NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"]:
+        return None, "Invalid metric selected"
+    data = NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric]
+    models = list(data.keys())
+    scores = list(data.values())
+    fig = go.Figure()
+    fig.add_trace(go.Bar(
+        x=scores,
+        y=models,
+        orientation='h',
+        marker_color=['yellow', 'orange']  # Matching the provided image colors
+    ))
+    fig.update_layout(
+        title=f"Nexa Mistral Sci-7B Evaluation: {metric}",
+        xaxis_title="Score (1-10)",
+        yaxis_title="Model",
+        xaxis_range=[0, 10],
+        template="plotly_white",
+        height=400,
+        margin=dict(l=120, r=20, t=40, b=40),
+        yaxis=dict(automargin=True),
+        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
     )
     return fig
 def display_tabular_eval(domain):
     if domain not in TABULAR_MODEL_EVALS:
         return None, "Invalid domain selected"
+    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], highlight_color='indigo', default_color='lightgray')
     details = json.dumps(TABULAR_MODEL_EVALS[domain], indent=2)
     return plot, details
 def display_llm_eval(domain):
     if domain not in LLM_MODEL_EVALS:
         return None, "Invalid domain selected"
+    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], highlight_color='lightblue', default_color='gray')
     details = json.dumps(LLM_MODEL_EVALS[domain], indent=2)
     return plot, details
+def display_mistral_eval(metric):
+    plot = plot_mistral_eval(metric)
+    details = json.dumps(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric], indent=2)
+    return plot, details
+# Gradio interface with improved styling
+with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #f0f0f0; color: #333;}") as demo:
     gr.Markdown("""
     # 🔬 Nexa Evals — Scientific ML Benchmark Suite
+    A comprehensive benchmarking suite comparing Nexa models against state-of-the-art models.
     """)
     with gr.Tabs():
                 outputs=[llm_plot, llm_details]
             )
+        with gr.TabItem("Nexa Mistral Sci-7B"):
+            with gr.Row():
+                mistral_metric = gr.Dropdown(
+                    choices=list(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"].keys()),
+                    label="Select Metric",
+                    value="Scientific Utility"
+                )
+                show_mistral_btn = gr.Button("Show Evaluation")
+            mistral_plot = gr.Plot(label="Benchmark Plot")
+            mistral_details = gr.Code(label="Raw Scores (JSON)", language="json")
+            show_mistral_btn.click(
+                fn=display_mistral_eval,
+                inputs=mistral_metric,
+                outputs=[mistral_plot, mistral_details]
+            )
     gr.Markdown("""
     ---
     ### ℹ️ About
+    Nexa Evals provides benchmarks for tabular models, language models, and specific evaluations like Nexa Mistral Sci-7B:
+    - **Tabular Models**: Evaluated on domain-specific metrics across fields like Proteins and Astro.
+    - **LLMs**: Assessed using the SciEval benchmark under the OSIR initiative.
+    - **Nexa Mistral Sci-7B**: Compares general (OSIR) and physics-specific (OSIR-Field) performance across multiple metrics.
+    Scores are normalized where applicable (0-1 for tabular/LLMs, 1-10 for Mistral).
     """)
 demo.launch()