evalita_llm_leaderboard

Running

App Files Files Community

rzanoli committed on Aug 26

Commit

a43032b

1 Parent(s): af23b14

Add the chart for Mean Accuracy Difference (few-shot - zero-shot)

Browse files

Files changed (1) hide show

app.py +109 -16

app.py CHANGED Viewed

@@ -46,9 +46,78 @@ def mean_of_max_per_field(df):
     return mean_max
 def boxplot_per_task(dataframe=None, baselines=None):
-    #print(dataframe.columns)
     tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
@@ -111,19 +180,20 @@ def boxplot_per_task(dataframe=None, baselines=None):
         dragmode=False,
         font=dict(family="Arial", size=13),
         margin=dict(b=140),
-        annotations=[
-            dict(
-                text=(
-                    "Boxplots show LLM accuracy in zero/few-shot settings. Black dashed lines<br>"
-                    "indicate best-performing supervised models evaluated on EVALITA."
-                ),
-                xref="paper", yref="paper",
-                x=0.5, y=-0.30,
-                showarrow=False,
-                font=dict(size=12, color="gray")
-            )
-        ]
     )
     fig.update_yaxes(range=[0, 100], fixedrange=True)
     return fig
@@ -192,13 +262,24 @@ def boxplot_prompts_per_task(dataframe, tasks=None):
         )
     fig.update_layout(
-        title="Average Prompt Accuracy vs Best Prompt Accuracy per Task",
-        xaxis_title="Task",
         yaxis_title="Combined Performance",
         barmode='group',
         template="plotly_white",
         font=dict(family="Arial", size=13),
-        yaxis=dict(range=[0, 100], fixedrange=True)
     )
     return fig
@@ -260,6 +341,17 @@ def line_chart(dataframe):
         dragmode=False
     )
     # Disabilita lo zoom e altri controlli
     fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
     fig.update_yaxes(fixedrange=True)
@@ -580,6 +672,7 @@ with demo:
             gr.Plot(value=line_chart(LEADERBOARD_DF))
             gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
             gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
         # About tab
         with gr.TabItem("📝 About"):

     return mean_max
+def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):  # Builds and returns a bar chart: per task, the mean over models of (few-shot score - zero-shot score)
+    if tasks is None:  # no task list supplied: fall back to the ten default task columns
+        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+    task_means = {}  # task name -> mean difference; insertion order becomes the bar order
+    for task in tasks:
+        if task not in dataframe.columns:  # tasks without a column in the dataframe are silently left out of the chart
+            continue
+        # Split the rows into few-shot (IS_FS == True) and zero-shot (IS_FS == False), keeping only "Model" and the task column
+        few_shot = dataframe[dataframe['IS_FS'] == True][["Model", task]]
+        zero_shot = dataframe[dataframe['IS_FS'] == False][["Model", task]]
+        # Align the two settings on "Model"; pd.merge defaults to an inner join, so only models present in both settings remain
+        merged = pd.merge(few_shot, zero_shot, on="Model", suffixes=("_few", "_zero"))  # NOTE(review): a model with several rows per setting yields every few/zero pairing -- confirm "Model" is unique per setting
+        # Drop rows where either the few-shot or the zero-shot value is missing
+        merged = merged.dropna(subset=[f"{task}_few", f"{task}_zero"])
+        if merged.empty:  # no model has both values for this task: no bar is drawn for it
+            continue
+        # Compute the per-model difference few - zero
+        diff = merged[f"{task}_few"] - merged[f"{task}_zero"]
+        # Compute the mean of the differences for this task
+        task_means[task] = diff.mean()
+    # Build the bar plot: one bar per task that produced a mean, labelled with the value to two decimals
+    fig = go.Figure([go.Bar(
+        x=list(task_means.keys()),
+        y=list(task_means.values()),
+        marker_color="#ff7f0e",
+        text=[f"{v:.2f}" for v in task_means.values()],
+        textposition="outside",
+        hovertemplate="<b>%{x}</b><br>Mean Delta Accuracy: %{y:.2f}%<extra></extra>"
+    )])
+    # Reference line at y = 0, so positive and negative mean differences are easy to tell apart
+    fig.add_shape(
+        type="line",
+        x0=-0.5, x1=len(task_means) - 0.5,  # presumably bars sit at positions 0..n-1, so this spans all of them -- TODO confirm for the categorical x axis
+        y0=0, y1=0,
+        line=dict(color="black", width=2, dash="dash"),
+        xref="x", yref="y"
+    )
+    fig.update_layout(
+        title="Mean Accuracy Difference (Few-shot − Zero-shot) per Task",
+        xaxis_title="",
+        yaxis_title="Mean Delta Combined Performance",
+        template="plotly_white",
+        font=dict(family="Arial", size=13),
+    )
+    fig.add_annotation(  # static caption: the wording is hard-coded and is not derived from task_means
+        text="5-shot learning generally outperforms zero-shot, especially in tasks like NER and REL.<br>"
+             "Only in Summarization (SU) does it provide no accuracy gain.",
+        xref="paper", yref="paper",
+        x=0, y=-0.2,
+        showarrow=False,
+        font=dict(size=11, color="gray"),
+        align="left"
+    )
+    return fig
 def boxplot_per_task(dataframe=None, baselines=None):
+    print(dataframe.columns)
     tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
         dragmode=False,
         font=dict(family="Arial", size=13),
         margin=dict(b=140),
     )
+    fig.add_annotation(
+        text=(
+            "In zero/few-shot settings, models are getting closer to the supervised EVALITA baselines <br>"
+            "(black dashed line), except for NER and REL."
+        ),
+        xref="paper", yref="paper",
+        x=0.5, y=-0.30,
+        showarrow=False,
+        font=dict(size=12, color="gray"),
+        align="left"
+    )
     fig.update_yaxes(range=[0, 100], fixedrange=True)
     return fig
         )
     fig.update_layout(
+        title= "Average Prompt Accuracy vs Best Prompt Accuracy per Task",
+        xaxis_title="",
         yaxis_title="Combined Performance",
         barmode='group',
         template="plotly_white",
         font=dict(family="Arial", size=13),
+        yaxis=dict(range=[0, 100], fixedrange=True),
+    )
+    # Aggiungi la caption come annotazione separata
+    fig.add_annotation(
+        text="There is no single prompt that performs best across all tasks.<br>"
+             "Different prompts achieve the highest accuracy on different tasks.",
+        xref="paper", yref="paper",
+        x=0, y=-0.3,
+        showarrow=False,
+        font=dict(size=11, color="gray"),
+        align="left"
     )
     return fig
         dragmode=False
     )
+    # Aggiungi la caption come annotazione separata
+    fig.add_annotation(
+        text="Models with more parameters generally perform better than smaller ones. However, few-shot learning <br>"
+        "can sometimes enable smaller models to outperform larger models evaluated in zero-shot settings.",
+        xref="paper", yref="paper",
+        x=0, y=-0.3,
+        showarrow=False,
+        font=dict(size=11, color="gray"),
+        align="left"
+    )
     # Disabilita lo zoom e altri controlli
     fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
     fig.update_yaxes(fixedrange=True)
             gr.Plot(value=line_chart(LEADERBOARD_DF))
             gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
             gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
+            gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF))
         # About tab
         with gr.TabItem("📝 About"):