Allanatrix committed
Commit 94c2f22 · verified · 1 parent: b0ad3dc

Update app.py

Files changed (1)
  1. app.py +78 -12
app.py CHANGED
@@ -53,17 +53,31 @@ LLM_MODEL_EVALS = {
     },
 }
 
-# Universal plotting function for horizontal bar charts
-def plot_horizontal_bar(domain, data, color):
+# Data for Nexa Mistral Sci-7B Evaluation (based on the provided image)
+NEXA_MISTRAL_EVALS = {
+    "Nexa Mistral Sci-7B": {
+        "Scientific Utility": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.5},
+        "Symbolism & Math Logic": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.5},
+        "Citation & Structure": {"OSIR (General)": 5.5, "OSIR-Field (Physics)": 6.0},
+        "Thematic Grounding": {"OSIR (General)": 7.0, "OSIR-Field (Physics)": 8.0},
+        "Hypothesis Framing": {"OSIR (General)": 6.0, "OSIR-Field (Physics)": 7.0},
+        "Internal Consistency": {"OSIR (General)": 9.0, "OSIR-Field (Physics)": 9.5},
+        "Entropy / Novelty": {"OSIR (General)": 6.5, "OSIR-Field (Physics)": 6.0},
+    }
+}
+
+# Universal plotting function with highlighted Nexa models
+def plot_horizontal_bar(domain, data, highlight_keyword="Nexa", highlight_color='indigo', default_color='lightgray'):
     sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
     models, scores = zip(*sorted_items)
+    colors = [highlight_color if highlight_keyword in model else default_color for model in models]
 
     fig = go.Figure()
     fig.add_trace(go.Bar(
         x=scores,
         y=models,
         orientation='h',
-        marker_color=color,
+        marker_color=colors,
     ))
 
     fig.update_layout(
@@ -74,6 +88,36 @@ def plot_horizontal_bar(domain, data, color):
         template="plotly_white",
         height=500,
         margin=dict(l=120, r=20, t=40, b=40),
+        yaxis=dict(automargin=True),
+    )
+    return fig
+
+# Plotting function for Nexa Mistral Sci-7B Evaluation
+def plot_mistral_eval(metric):
+    if metric not in NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"]:
+        return None, "Invalid metric selected"
+    data = NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric]
+    models = list(data.keys())
+    scores = list(data.values())
+
+    fig = go.Figure()
+    fig.add_trace(go.Bar(
+        x=scores,
+        y=models,
+        orientation='h',
+        marker_color=['yellow', 'orange']  # Matching the provided image colors
+    ))
+
+    fig.update_layout(
+        title=f"Nexa Mistral Sci-7B Evaluation: {metric}",
+        xaxis_title="Score (1-10)",
+        yaxis_title="Model",
+        xaxis_range=[0, 10],
+        template="plotly_white",
+        height=400,
+        margin=dict(l=120, r=20, t=40, b=40),
+        yaxis=dict(automargin=True),
+        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
     )
     return fig
 
@@ -81,22 +125,27 @@ def plot_horizontal_bar(domain, data, color):
 def display_tabular_eval(domain):
     if domain not in TABULAR_MODEL_EVALS:
         return None, "Invalid domain selected"
-    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], 'indigo')
+    plot = plot_horizontal_bar(domain, TABULAR_MODEL_EVALS[domain], highlight_color='indigo', default_color='lightgray')
     details = json.dumps(TABULAR_MODEL_EVALS[domain], indent=2)
     return plot, details
 
 def display_llm_eval(domain):
     if domain not in LLM_MODEL_EVALS:
         return None, "Invalid domain selected"
-    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], 'lightblue')
+    plot = plot_horizontal_bar(domain, LLM_MODEL_EVALS[domain], highlight_color='lightblue', default_color='gray')
     details = json.dumps(LLM_MODEL_EVALS[domain], indent=2)
     return plot, details
 
-# Gradio interface
-with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
+def display_mistral_eval(metric):
+    plot = plot_mistral_eval(metric)
+    details = json.dumps(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"][metric], indent=2)
+    return plot, details
+
+# Gradio interface with improved styling
+with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #f0f0f0; color: #333;}") as demo:
     gr.Markdown("""
     # 🔬 Nexa Evals — Scientific ML Benchmark Suite
-    A comprehensive benchmarking suite comparing Nexa models against state-of-the-art models across scientific domains and language models.
+    A comprehensive benchmarking suite comparing Nexa models against state-of-the-art models.
     """)
 
     with gr.Tabs():
@@ -132,13 +181,30 @@ with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #f
                 outputs=[llm_plot, llm_details]
             )
 
+        with gr.TabItem("Nexa Mistral Sci-7B"):
+            with gr.Row():
+                mistral_metric = gr.Dropdown(
+                    choices=list(NEXA_MISTRAL_EVALS["Nexa Mistral Sci-7B"].keys()),
+                    label="Select Metric",
+                    value="Scientific Utility"
+                )
+                show_mistral_btn = gr.Button("Show Evaluation")
+            mistral_plot = gr.Plot(label="Benchmark Plot")
+            mistral_details = gr.Code(label="Raw Scores (JSON)", language="json")
+            show_mistral_btn.click(
+                fn=display_mistral_eval,
+                inputs=mistral_metric,
+                outputs=[mistral_plot, mistral_details]
+            )
+
     gr.Markdown("""
     ---
    ### ℹ️ About
-    Nexa Evals provides benchmarks for both tabular models and language models in scientific domains:
-    - **Tabular Models**: Evaluated on domain-specific metrics (e.g., accuracy, GDT-TS) across fields like Proteins, Astro, Materials, QST, HEP, and CFD.
-    - **Language Models**: Assessed using the SciEval benchmark under the OSIR initiative, focusing on scientific utility, information entropy, internal consistency, hypothesis framing, domain grounding, and math logic.
-    Scores range from 0 to 1, with higher values indicating better performance. Models are sorted by score in descending order for easy comparison.
+    Nexa Evals provides benchmarks for tabular models, language models, and specific evaluations like Nexa Mistral Sci-7B:
+    - **Tabular Models**: Evaluated on domain-specific metrics across fields like Proteins and Astro.
+    - **LLMs**: Assessed using the SciEval benchmark under the OSIR initiative.
+    - **Nexa Mistral Sci-7B**: Compares general (OSIR) and physics-specific (OSIR-Field) performance across multiple metrics.
+    Scores are normalized where applicable (0-1 for tabular/LLMs, 1-10 for Mistral).
     """)
 
 demo.launch()
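
For quick review, here is a minimal standalone sketch of the highlight logic this commit adds to plot_horizontal_bar(); the model names and scores below are invented for illustration and are not taken from app.py.

import plotly.graph_objects as go

# Illustrative data only — not from app.py
data = {"Nexa Bio": 0.91, "Baseline A": 0.84, "Baseline B": 0.79}
highlight_keyword, highlight_color, default_color = "Nexa", "indigo", "lightgray"

# Sort models by score (descending) and color any name containing the
# keyword, mirroring the updated plot_horizontal_bar()
sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)
models, scores = zip(*sorted_items)
colors = [highlight_color if highlight_keyword in m else default_color for m in models]

fig = go.Figure(go.Bar(x=scores, y=models, orientation="h", marker_color=colors))
fig.update_layout(template="plotly_white", yaxis=dict(automargin=True))
fig.show()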