rzanoli commited on
Commit
a43032b
·
1 Parent(s): af23b14

Add the chart for Mean Accuracy Differenze (few-shot - zero-shot)

Browse files
Files changed (1) hide show
  1. app.py +109 -16
app.py CHANGED
@@ -46,9 +46,78 @@ def mean_of_max_per_field(df):
46
  return mean_max
47
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def boxplot_per_task(dataframe=None, baselines=None):
50
 
51
- #print(dataframe.columns)
52
 
53
  tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
54
 
@@ -111,19 +180,20 @@ def boxplot_per_task(dataframe=None, baselines=None):
111
  dragmode=False,
112
  font=dict(family="Arial", size=13),
113
  margin=dict(b=140),
114
- annotations=[
115
- dict(
116
- text=(
117
- "Boxplots show LLM accuracy in zero/few-shot settings. Black dashed lines<br>"
118
- "indicate best-performing supervised models evaluated on EVALITA."
119
- ),
120
- xref="paper", yref="paper",
121
- x=0.5, y=-0.30,
122
- showarrow=False,
123
- font=dict(size=12, color="gray")
124
- )
125
- ]
126
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  fig.update_yaxes(range=[0, 100], fixedrange=True)
128
 
129
  return fig
@@ -192,13 +262,24 @@ def boxplot_prompts_per_task(dataframe, tasks=None):
192
  )
193
 
194
  fig.update_layout(
195
- title="Average Prompt Accuracy vs Best Prompt Accuracy per Task",
196
- xaxis_title="Task",
197
  yaxis_title="Combined Performance",
198
  barmode='group',
199
  template="plotly_white",
200
  font=dict(family="Arial", size=13),
201
- yaxis=dict(range=[0, 100], fixedrange=True)
 
 
 
 
 
 
 
 
 
 
 
202
  )
203
 
204
  return fig
@@ -260,6 +341,17 @@ def line_chart(dataframe):
260
  dragmode=False
261
  )
262
 
 
 
 
 
 
 
 
 
 
 
 
263
  # Disabilita lo zoom e altri controlli
264
  fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
265
  fig.update_yaxes(fixedrange=True)
@@ -580,6 +672,7 @@ with demo:
580
  gr.Plot(value=line_chart(LEADERBOARD_DF))
581
  gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
582
  gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
 
583
 
584
  # About tab
585
  with gr.TabItem("📝 About"):
 
46
  return mean_max
47
 
48
 
49
+ def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
50
+ if tasks is None:
51
+ tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
52
+
53
+ task_means = {}
54
+
55
+ for task in tasks:
56
+ if task not in dataframe.columns:
57
+ continue
58
+
59
+ # Separa few-shot e zero-shot
60
+ few_shot = dataframe[dataframe['IS_FS'] == True][["Model", task]]
61
+ zero_shot = dataframe[dataframe['IS_FS'] == False][["Model", task]]
62
+
63
+ # Allinea i modelli
64
+ merged = pd.merge(few_shot, zero_shot, on="Model", suffixes=("_few", "_zero"))
65
+
66
+ # Rimuovi righe con valori mancanti
67
+ merged = merged.dropna(subset=[f"{task}_few", f"{task}_zero"])
68
+
69
+ if merged.empty:
70
+ continue
71
+
72
+ # Calcola differenza few - zero
73
+ diff = merged[f"{task}_few"] - merged[f"{task}_zero"]
74
+
75
+ # Calcola la media
76
+ task_means[task] = diff.mean()
77
+
78
+ # Crea barplot
79
+ fig = go.Figure([go.Bar(
80
+ x=list(task_means.keys()),
81
+ y=list(task_means.values()),
82
+ marker_color="#ff7f0e",
83
+ text=[f"{v:.2f}" for v in task_means.values()],
84
+ textposition="outside",
85
+ hovertemplate="<b>%{x}</b><br>Mean Delta Accuracy: %{y:.2f}%<extra></extra>"
86
+ )])
87
+
88
+ # Linea di riferimento a 0
89
+ fig.add_shape(
90
+ type="line",
91
+ x0=-0.5, x1=len(task_means) - 0.5,
92
+ y0=0, y1=0,
93
+ line=dict(color="black", width=2, dash="dash"),
94
+ xref="x", yref="y"
95
+ )
96
+
97
+ fig.update_layout(
98
+ title="Mean Accuracy Difference (Few-shot − Zero-shot) per Task",
99
+ xaxis_title="",
100
+ yaxis_title="Mean Delta Combined Performance",
101
+ template="plotly_white",
102
+ font=dict(family="Arial", size=13),
103
+ )
104
+
105
+ fig.add_annotation(
106
+ text="5-shot learning generally outperforms zero-shot, especially in tasks like NER and REL.<br>"
107
+ "Only in Summarization (SU) does it provide no accuracy gain.",
108
+ xref="paper", yref="paper",
109
+ x=0, y=-0.2,
110
+ showarrow=False,
111
+ font=dict(size=11, color="gray"),
112
+ align="left"
113
+ )
114
+
115
+ return fig
116
+
117
+
118
  def boxplot_per_task(dataframe=None, baselines=None):
119
 
120
+ print(dataframe.columns)
121
 
122
  tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
123
 
 
180
  dragmode=False,
181
  font=dict(family="Arial", size=13),
182
  margin=dict(b=140),
 
 
 
 
 
 
 
 
 
 
 
 
183
  )
184
+
185
+ fig.add_annotation(
186
+ text=(
187
+ "In zero/few-shot settings, models are getting closer to the supervised EVALITA baselines <br>"
188
+ "(black dashed line), except for NER and REL."
189
+ ),
190
+ xref="paper", yref="paper",
191
+ x=0.5, y=-0.30,
192
+ showarrow=False,
193
+ font=dict(size=12, color="gray"),
194
+ align="left"
195
+ )
196
+
197
  fig.update_yaxes(range=[0, 100], fixedrange=True)
198
 
199
  return fig
 
262
  )
263
 
264
  fig.update_layout(
265
+ title= "Average Prompt Accuracy vs Best Prompt Accuracy per Task",
266
+ xaxis_title="",
267
  yaxis_title="Combined Performance",
268
  barmode='group',
269
  template="plotly_white",
270
  font=dict(family="Arial", size=13),
271
+ yaxis=dict(range=[0, 100], fixedrange=True),
272
+ )
273
+
274
+ # Aggiungi la caption come annotazione separata
275
+ fig.add_annotation(
276
+ text="There is no single prompt that performs best across all tasks.<br>"
277
+ "Different prompts achieve the highest accuracy on different tasks.",
278
+ xref="paper", yref="paper",
279
+ x=0, y=-0.3,
280
+ showarrow=False,
281
+ font=dict(size=11, color="gray"),
282
+ align="left"
283
  )
284
 
285
  return fig
 
341
  dragmode=False
342
  )
343
 
344
+ # Aggiungi la caption come annotazione separata
345
+ fig.add_annotation(
346
+ text="Models with more parameters generally perform better than smaller ones. However, few-shot learning <br>"
347
+ "can sometimes enable smaller models to outperform larger models evaluated in zero-shot settings.",
348
+ xref="paper", yref="paper",
349
+ x=0, y=-0.3,
350
+ showarrow=False,
351
+ font=dict(size=11, color="gray"),
352
+ align="left"
353
+ )
354
+
355
  # Disabilita lo zoom e altri controlli
356
  fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
357
  fig.update_yaxes(fixedrange=True)
 
672
  gr.Plot(value=line_chart(LEADERBOARD_DF))
673
  gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
674
  gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
675
+ gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF))
676
 
677
  # About tab
678
  with gr.TabItem("📝 About"):