Add the chart for Mean Accuracy Difference (few-shot - zero-shot)
Browse files
app.py
CHANGED
|
@@ -46,9 +46,78 @@ def mean_of_max_per_field(df):
|
|
| 46 |
return mean_max
|
| 47 |
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def boxplot_per_task(dataframe=None, baselines=None):
|
| 50 |
|
| 51 |
-
|
| 52 |
|
| 53 |
tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
|
| 54 |
|
|
@@ -111,19 +180,20 @@ def boxplot_per_task(dataframe=None, baselines=None):
|
|
| 111 |
dragmode=False,
|
| 112 |
font=dict(family="Arial", size=13),
|
| 113 |
margin=dict(b=140),
|
| 114 |
-
annotations=[
|
| 115 |
-
dict(
|
| 116 |
-
text=(
|
| 117 |
-
"Boxplots show LLM accuracy in zero/few-shot settings. Black dashed lines<br>"
|
| 118 |
-
"indicate best-performing supervised models evaluated on EVALITA."
|
| 119 |
-
),
|
| 120 |
-
xref="paper", yref="paper",
|
| 121 |
-
x=0.5, y=-0.30,
|
| 122 |
-
showarrow=False,
|
| 123 |
-
font=dict(size=12, color="gray")
|
| 124 |
-
)
|
| 125 |
-
]
|
| 126 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
fig.update_yaxes(range=[0, 100], fixedrange=True)
|
| 128 |
|
| 129 |
return fig
|
|
@@ -192,13 +262,24 @@ def boxplot_prompts_per_task(dataframe, tasks=None):
|
|
| 192 |
)
|
| 193 |
|
| 194 |
fig.update_layout(
|
| 195 |
-
title="Average Prompt Accuracy vs Best Prompt Accuracy per Task",
|
| 196 |
-
xaxis_title="
|
| 197 |
yaxis_title="Combined Performance",
|
| 198 |
barmode='group',
|
| 199 |
template="plotly_white",
|
| 200 |
font=dict(family="Arial", size=13),
|
| 201 |
-
yaxis=dict(range=[0, 100], fixedrange=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
)
|
| 203 |
|
| 204 |
return fig
|
|
@@ -260,6 +341,17 @@ def line_chart(dataframe):
|
|
| 260 |
dragmode=False
|
| 261 |
)
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
# Disabilita lo zoom e altri controlli
|
| 264 |
fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
|
| 265 |
fig.update_yaxes(fixedrange=True)
|
|
@@ -580,6 +672,7 @@ with demo:
|
|
| 580 |
gr.Plot(value=line_chart(LEADERBOARD_DF))
|
| 581 |
gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
|
| 582 |
gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
|
|
|
|
| 583 |
|
| 584 |
# About tab
|
| 585 |
with gr.TabItem("📝 About"):
|
|
|
|
| 46 |
return mean_max
|
| 47 |
|
| 48 |
|
| 49 |
+
def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
    """Build a bar chart of the mean (few-shot - zero-shot) score per task.

    Rows of ``dataframe`` are split on the ``IS_FS`` column (``True`` rows are
    treated as few-shot, ``False`` rows as zero-shot), the two halves are
    paired on ``Model``, and for every task the few-shot minus zero-shot
    difference is averaged over the pairs where both values are present.

    Args:
        dataframe: pandas DataFrame with a ``Model`` column, an ``IS_FS``
            column and one column per task.
        tasks: Task column names to plot. Defaults to the ten built-in task
            codes. Names that are not columns of ``dataframe`` are skipped.

    Returns:
        A plotly ``go.Figure`` with one bar per task that has at least one
        complete few-shot/zero-shot pair, a dashed reference line at 0 and a
        caption annotation below the plot.
    """
    if tasks is None:
        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]

    # Requested tasks that exist in the data, in order; duplicates are dropped
    # so the single merge below never sees a repeated column.
    present = [task for task in dict.fromkeys(tasks) if task in dataframe.columns]

    task_means = {}
    if present:
        # The few-shot / zero-shot split and the model alignment do not depend
        # on the task, so they are done once instead of once per task.
        columns = ["Model", *present]
        few_shot = dataframe.loc[dataframe["IS_FS"] == True, columns]  # noqa: E712
        zero_shot = dataframe.loc[dataframe["IS_FS"] == False, columns]  # noqa: E712
        paired = pd.merge(few_shot, zero_shot, on="Model", suffixes=("_few", "_zero"))

        for task in present:
            few_col, zero_col = f"{task}_few", f"{task}_zero"

            # Only pairs with both scores contribute to this task's mean.
            complete = paired[[few_col, zero_col]].dropna()
            if complete.empty:
                continue

            task_means[task] = (complete[few_col] - complete[zero_col]).mean()

    labels = list(task_means)
    values = list(task_means.values())

    fig = go.Figure([go.Bar(
        x=labels,
        y=values,
        marker_color="#ff7f0e",
        text=[f"{v:.2f}" for v in values],
        textposition="outside",
        hovertemplate="<b>%{x}</b><br>Mean Delta Accuracy: %{y:.2f}%<extra></extra>"
    )])

    # Reference line at 0, spanning the full width of the categorical axis.
    fig.add_shape(
        type="line",
        x0=-0.5, x1=len(task_means) - 0.5,
        y0=0, y1=0,
        line=dict(color="black", width=2, dash="dash"),
        xref="x", yref="y"
    )

    fig.update_layout(
        title="Mean Accuracy Difference (Few-shot − Zero-shot) per Task",
        xaxis_title="",
        yaxis_title="Mean Delta Combined Performance",
        template="plotly_white",
        font=dict(family="Arial", size=13),
    )

    # Caption placed below the plot area (paper coordinates).
    fig.add_annotation(
        text="5-shot learning generally outperforms zero-shot, especially in tasks like NER and REL.<br>"
             "Only in Summarization (SU) does it provide no accuracy gain.",
        xref="paper", yref="paper",
        x=0, y=-0.2,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="left"
    )

    return fig
|
| 116 |
+
|
| 117 |
+
|
| 118 |
def boxplot_per_task(dataframe=None, baselines=None):
|
| 119 |
|
| 120 |
+
print(dataframe.columns)
|
| 121 |
|
| 122 |
tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
|
| 123 |
|
|
|
|
| 180 |
dragmode=False,
|
| 181 |
font=dict(family="Arial", size=13),
|
| 182 |
margin=dict(b=140),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
)
|
| 184 |
+
|
| 185 |
+
fig.add_annotation(
|
| 186 |
+
text=(
|
| 187 |
+
"In zero/few-shot settings, models are getting closer to the supervised EVALITA baselines <br>"
|
| 188 |
+
"(black dashed line), except for NER and REL."
|
| 189 |
+
),
|
| 190 |
+
xref="paper", yref="paper",
|
| 191 |
+
x=0.5, y=-0.30,
|
| 192 |
+
showarrow=False,
|
| 193 |
+
font=dict(size=12, color="gray"),
|
| 194 |
+
align="left"
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
fig.update_yaxes(range=[0, 100], fixedrange=True)
|
| 198 |
|
| 199 |
return fig
|
|
|
|
| 262 |
)
|
| 263 |
|
| 264 |
fig.update_layout(
|
| 265 |
+
title= "Average Prompt Accuracy vs Best Prompt Accuracy per Task",
|
| 266 |
+
xaxis_title="",
|
| 267 |
yaxis_title="Combined Performance",
|
| 268 |
barmode='group',
|
| 269 |
template="plotly_white",
|
| 270 |
font=dict(family="Arial", size=13),
|
| 271 |
+
yaxis=dict(range=[0, 100], fixedrange=True),
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
# Aggiungi la caption come annotazione separata
|
| 275 |
+
fig.add_annotation(
|
| 276 |
+
text="There is no single prompt that performs best across all tasks.<br>"
|
| 277 |
+
"Different prompts achieve the highest accuracy on different tasks.",
|
| 278 |
+
xref="paper", yref="paper",
|
| 279 |
+
x=0, y=-0.3,
|
| 280 |
+
showarrow=False,
|
| 281 |
+
font=dict(size=11, color="gray"),
|
| 282 |
+
align="left"
|
| 283 |
)
|
| 284 |
|
| 285 |
return fig
|
|
|
|
| 341 |
dragmode=False
|
| 342 |
)
|
| 343 |
|
| 344 |
+
# Aggiungi la caption come annotazione separata
|
| 345 |
+
fig.add_annotation(
|
| 346 |
+
text="Models with more parameters generally perform better than smaller ones. However, few-shot learning <br>"
|
| 347 |
+
"can sometimes enable smaller models to outperform larger models evaluated in zero-shot settings.",
|
| 348 |
+
xref="paper", yref="paper",
|
| 349 |
+
x=0, y=-0.3,
|
| 350 |
+
showarrow=False,
|
| 351 |
+
font=dict(size=11, color="gray"),
|
| 352 |
+
align="left"
|
| 353 |
+
)
|
| 354 |
+
|
| 355 |
# Disabilita lo zoom e altri controlli
|
| 356 |
fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
|
| 357 |
fig.update_yaxes(fixedrange=True)
|
|
|
|
| 672 |
gr.Plot(value=line_chart(LEADERBOARD_DF))
|
| 673 |
gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
|
| 674 |
gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
|
| 675 |
+
gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF))
|
| 676 |
|
| 677 |
# About tab
|
| 678 |
with gr.TabItem("📝 About"):
|