Spaces:
Running
Running
import json | |
import plotly.express as px | |
from utils.pareto import Agent, compute_pareto_frontier | |
import plotly.graph_objects as go | |
import textwrap | |
import numpy as np | |
import pandas as pd | |
from scipy import stats | |
def create_leaderboard(df, ci_metrics=None):
    """Return a string-typed copy of ``df`` with CI columns folded into metrics.

    Every column is first cast to ``str``. Then, for each name in
    ``ci_metrics``, the cell of column ``<metric>`` is rewritten as
    ``"<metric rounded to 2 decimals> (<CI text>)"`` whenever the matching
    ``"<metric> CI"`` cell holds a value.

    Args:
        df: Source leaderboard table. It is not modified; ``astype`` returns
            a new frame.
        ci_metrics: Optional iterable of metric column names that have a
            companion ``"<metric> CI"`` column.

    Returns:
        The string-typed DataFrame with the merged metric columns.

    Cells are left untouched when the CI value is missing (``None``/NaN or
    their string forms after the cast), when the metric value itself is
    missing, or when the metric text cannot be parsed as a float. Metrics
    whose metric or CI column is absent from ``df`` are skipped.
    """
    df = df.astype(str)
    if not ci_metrics:
        return df

    # After ``astype(str)`` missing values show up as text, so compare on text.
    missing_text = frozenset({'None', 'nan', 'NaN', '<NA>', 'NaT', ''})

    def _is_missing(cell):
        return bool(pd.isna(cell)) or str(cell).strip() in missing_text

    for metric in ci_metrics:
        ci_column = metric + ' CI'
        if metric not in df.columns or ci_column not in df.columns:
            continue

        merged = []
        for value, ci in zip(df[metric], df[ci_column]):
            if _is_missing(ci) or _is_missing(value):
                merged.append(value)
                continue
            try:
                rounded = round(float(value), 2)
            except (TypeError, ValueError):
                # Non-numeric metric text: keep it as-is rather than crash.
                merged.append(value)
                continue
            merged.append(str(rounded) + " (" + str(ci) + ")")

        # Assign the whole column at once so duplicate index labels are safe.
        df[metric] = merged
    return df
def create_task_success_heatmap(df, benchmark_name):
    """Build an agents-by-tasks heatmap of the ``Success`` column.

    Args:
        df: Table with ``Agent Name``, ``Task ID`` and ``Success`` columns.
            ``DataFrame.pivot`` is used, so each (agent, task) pair must
            appear at most once.
        benchmark_name: Benchmark label; for ``"SWE-bench Verified (Mini)"``
            the task total shown in the summary row is fixed at 50.

    Returns:
        A ``plotly.graph_objects.Figure``. Rows are agents ordered by mean
        success (descending), columns are tasks ordered by mean success
        (descending), and one extra row flags tasks with a positive column sum.
    """
    def _ranked_by_mean_success(key):
        # Index of ``key`` values, highest mean ``Success`` first.
        return df.groupby(key)['Success'].mean().sort_values(ascending=False).index

    agent_order = _ranked_by_mean_success('Agent Name')
    task_order = _ranked_by_mean_success('Task ID')

    matrix = df.pivot(index='Agent Name', columns='Task ID', values='Success')
    matrix = matrix.reindex(index=agent_order, columns=task_order)

    # 1 where the column sum over agents is positive, else 0.
    solved_by_any = (matrix.sum(axis=0) > 0).astype(int)

    # TODO - remove hardcoding
    task_count = 50 if benchmark_name == "SWE-bench Verified (Mini)" else len(matrix.columns)

    summary_row = pd.DataFrame(solved_by_any).T
    summary_row.index = [f'<b>Tasks Solved: {solved_by_any.sum()}/{task_count} (Any Agent)</b>']
    matrix = pd.concat([matrix, summary_row])

    # Fixed pixel height per row (agents plus the summary row), plus padding.
    pixels_per_row = 30
    figure_height = len(matrix.index) * pixels_per_row + 50

    heatmap = go.Heatmap(
        z=matrix.values,
        y=matrix.index,
        x=matrix.columns,
        colorscale=[[0, 'white'], [1, '#3498db']],
        showscale=False,
        hovertemplate=(
            '<b>Agent:</b> %{y}<br>'
            '<b>Task:</b> %{x}<br>'
            '<b>Status:</b> %{z}<extra></extra>'
        ),
    )
    fig = go.Figure(data=heatmap)

    axis_frame = dict(showline=True, linecolor='black', showgrid=False)
    modebar_settings = dict(
        activecolor='#1f77b4',
        orientation='h',
        bgcolor='rgba(255,255,255,0.8)',
        color='#777',
        add=['pan2d'],
        remove=[
            'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
            'hoverClosestCartesian', 'hoverCompareCartesian',
            'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select',
        ],
    )

    fig.update_layout(
        xaxis_title='Task ID',
        height=figure_height,
        yaxis=dict(autorange='reversed', showticklabels=True, **axis_frame),
        xaxis=dict(side='top', showticklabels=False, **axis_frame),
        plot_bgcolor='white',
        paper_bgcolor='white',
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
        modebar=modebar_settings,
        dragmode='pan',
    )
    return fig
def create_bar_chart(categories, values, x_label, y_label, title):
    """Build a horizontal bar chart of category counts, largest first.

    Args:
        categories: Category labels, one per bar.
        values: Numeric counts, aligned with ``categories``.
        x_label: Accepted for interface compatibility; not applied to the
            figure by this function.
        y_label: Accepted for interface compatibility; not applied.
        title: Accepted for interface compatibility; not applied.

    Returns:
        A ``plotly.graph_objects.Figure``. Each bar is annotated with its
        share of the summed values, worded as a share "of failures".

    Empty input yields an empty chart, and a zero total yields 0.0% shares,
    instead of raising.
    """
    ranked = sorted(zip(categories, values), key=lambda pair: pair[1], reverse=True)
    if ranked:
        categories, values = zip(*ranked)
    else:
        # zip(*[]) cannot be unpacked into two names; use empty sequences.
        categories, values = (), ()

    total = sum(values)

    def _share(value):
        # Guard against a zero total (all counts are 0).
        return value / total if total else 0.0

    bar_text = [f"({_share(value):.1%} of failures)" for value in values]
    hover_rows = [f'{value} tasks ({_share(value):.1%} of failures)' for value in values]

    fig = go.Figure(data=[go.Bar(
        y=categories,
        x=values,
        orientation='h',
        marker_color='#3498db',
        text=bar_text,
        textposition='auto',
        customdata=hover_rows,
        textfont=dict(color='black', size=14, family='Arial', weight=2),
        hovertemplate='<b>%{y}</b><br>' +
                      'Affected Tasks: %{customdata}<extra></extra>',
    )])

    fig.update_layout(
        height=600,
        xaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False,
        ),
        yaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False,
            # Reversed so the first (largest) category is drawn at the top.
            autorange="reversed",
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        bargap=0.2,
        bargroupgap=0.1,
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
        modebar=dict(
            activecolor='#1f77b4',
            orientation='h',
            bgcolor='rgba(255,255,255,0.8)',
            color='#777',
            add=['pan2d'],
            remove=[
                'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
                'hoverClosestCartesian', 'hoverCompareCartesian',
                'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select',
            ],
        ),
        dragmode='pan',
    )
    return fig
def _clean_agent_name(name):
    """Turn a Markdown link ``[label](url)`` into ``label``; else return ``name``."""
    text = str(name)
    if text.startswith('[') and ']' in text:
        return text[1:text.index(']')]
    return name


def _label_offset(px, py, x_min, x_max, y_min, y_max):
    """Return the (ax, ay) pixel offset for the label of the point (px, py).

    The default offset is (20, 20). It is overridden when the point lies
    within 5% of the minimum or 10% of the maximum of either axis range.
    """
    x_span = x_max - x_min
    y_span = y_max - y_min
    ax, ay = 20, 20
    if px < x_min + 0.05 * x_span:
        ax = 120  # large shift for points very close to the left edge
    if px > x_max - 0.1 * x_span:
        ax = -20  # negative ax puts the label left of the point
    if py < y_min + 0.05 * y_span:
        ay = -30  # negative ay puts the label above the point
    if py > y_max - 0.1 * y_span:
        ay = -20
    return ax, ay


def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
    """Plot one marker per agent with min-max error bars and a Pareto line.

    Args:
        df: Table of runs. Must contain ``Agent Name``, ``Total Cost`` and
            ``Accuracy`` (used for the Pareto frontier) plus the ``x``, ``y``
            and ``hover_data`` columns.
        x: Column averaged per agent for the horizontal position.
        y: Column averaged per agent for the vertical position.
        x_label: Title for the x axis.
        y_label: Title for the y axis.
        hover_data: Columns passed to the markers as ``customdata``. The
            first entry is also the column used to group rows into agents.
            Defaults to ``['Agent Name']`` when omitted or empty.

    Returns:
        A ``plotly.graph_objects.Figure``.

    Notes:
        The Pareto frontier and the hover text always use the
        ``Total Cost`` / ``Accuracy`` columns and wording, independent of
        ``x`` and ``y``.
    """
    if not hover_data:
        # The original default of None crashed on hover_data[0].
        hover_data = ['Agent Name']
    group_column = hover_data[0]

    # One Agent per unique agent name, using mean cost and mean accuracy.
    pareto_candidates = []
    for agent_name in df['Agent Name'].unique():
        agent_rows = df[df['Agent Name'] == agent_name]
        pareto_candidates.append(
            Agent(agent_rows['Total Cost'].mean(), agent_rows['Accuracy'].mean())
        )
    pareto_frontier = compute_pareto_frontier(pareto_candidates)
    pareto_points = sorted(
        ((agent.total_cost, agent.accuracy) for agent in pareto_frontier),
        key=lambda point: point[0],
    )

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=[point[0] for point in pareto_points],
        y=[point[1] for point in pareto_points],
        mode='lines',
        name='Pareto Frontier',
        # hoverinfo=None is a no-op in plotly; 'skip' actually disables hover.
        hoverinfo='skip',
        line=dict(color='black', width=1, dash='dash'),
    ))

    # Coordinates and labels collected for annotation placement afterwards.
    all_x = []
    all_y = []
    all_labels = []

    for agent in df[group_column].dropna().unique():
        # Copy so the name clean-up below never writes into a view of ``df``.
        agent_data = df[df[group_column] == agent].copy()
        agent_data['Agent Name'] = agent_data['Agent Name'].apply(_clean_agent_name)

        x_mean = float(np.mean(agent_data[x].values))
        y_mean = float(np.mean(agent_data[y].values))

        all_x.append(x_mean)
        all_y.append(y_mean)
        all_labels.append(agent_data['Agent Name'].iloc[0])

        if len(agent_data) > 1:
            # Invisible markers that only carry the min-max error bars.
            fig.add_trace(go.Scatter(
                x=[x_mean],
                y=[y_mean],
                error_x=dict(
                    type='data',
                    symmetric=False,
                    array=[float(np.max(agent_data[x])) - x_mean],
                    arrayminus=[x_mean - float(np.min(agent_data[x]))],
                    color='#fec44f',
                ),
                mode='markers',
                marker=dict(color='rgba(0,0,0,0)', opacity=0),
                showlegend=False,
                hoverinfo='skip',
            ))
            fig.add_trace(go.Scatter(
                x=[x_mean],
                y=[y_mean],
                error_y=dict(
                    type='data',
                    symmetric=False,
                    array=[float(np.max(agent_data[y])) - y_mean],
                    arrayminus=[y_mean - float(np.min(agent_data[y]))],
                    color='#bdbdbd',
                ),
                mode='markers',
                marker=dict(color='rgba(0,0,0,0)', opacity=0),
                showlegend=False,
                hoverinfo='skip',
            ))

        # Visible marker for this agent.
        fig.add_trace(go.Scatter(
            x=[x_mean],
            y=[y_mean],
            mode='markers',
            marker=dict(size=10, color='#3498db'),
            customdata=agent_data[hover_data],
            showlegend=False,
            hovertemplate="<br>".join([
                "<b>Agent</b>: %{customdata[0]}",
                "<b>Total Cost</b>: $%{x:.1f}",
                "<b>Accuracy</b>: %{y:.1%}<extra></extra>",
            ]),
            hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
        ))

    # Data-less traces that only provide legend entries for the error bars.
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(color='#fec44f', size=10),
        name='Cost CI (Min-Max)',
    ))
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(color='#bdbdbd', size=10),
        name='Accuracy CI (Min-Max)',
    ))

    fig.update_layout(
        height=600,
        xaxis_title=x_label,
        yaxis_title=y_label,
        xaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False,
        ),
        yaxis=dict(
            showline=True,
            showgrid=False,
            linecolor='black',
        ),
        plot_bgcolor='white',
        legend=dict(
            yanchor="bottom",
            y=0.01,
            xanchor="right",
            x=0.98,
            bgcolor="rgba(255, 255, 255, 0.5)",
        ),
        modebar=dict(
            activecolor='#1f77b4',
            orientation='h',
            bgcolor='rgba(255,255,255,0.8)',
            color='#777',
            add=['pan2d'],
            remove=[
                'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
                'hoverClosestCartesian', 'hoverCompareCartesian',
                'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select',
            ],
        ),
        dragmode='pan',
        showlegend=True,
        annotations=[],
    )

    if all_x:
        x_min, x_max = min(all_x), max(all_x)
        y_min, y_max = min(all_y), max(all_y)
        x_span = x_max - x_min
        y_span = y_max - y_min

        for i, (px, py) in enumerate(zip(all_x, all_y)):
            # Skip the label when any earlier point lies within 12% of the
            # data range on both axes; the first point is always labelled.
            crowded = any(
                abs(px - qx) < 0.12 * x_span and abs(py - qy) < 0.12 * y_span
                for qx, qy in zip(all_x[:i], all_y[:i])
            )
            if crowded:
                continue

            ax, ay = _label_offset(px, py, x_min, x_max, y_min, y_max)
            fig.add_annotation(
                x=px,
                y=py,
                text=all_labels[i],
                showarrow=True,
                arrowhead=0,
                arrowsize=1,
                arrowwidth=1,
                arrowcolor="#CCCCCC",
                ax=ax,
                ay=ay,
                font=dict(size=10),
            )

    fig.update_yaxes(rangemode="tozero")
    fig.update_xaxes(rangemode="tozero")
    return fig
import plotly.graph_objects as go | |
import textwrap | |
def _parse_step_analysis(step):
    """Return the step's ``analysis`` as a dict, or ``{}`` if unusable.

    A string value is decoded as JSON. Anything that is missing, fails to
    decode, or does not decode to a dict yields an empty dict.
    """
    analysis = step.get('analysis')
    if isinstance(analysis, str):
        try:
            analysis = json.loads(analysis)
        except json.JSONDecodeError:
            return {}
    return analysis if isinstance(analysis, dict) else {}


def create_flow_chart(steps):
    """Draw a left-to-right chain of step nodes with hover descriptions.

    Args:
        steps: Sequence of mappings, each with an ``analysis`` entry that is
            either a dict or a JSON string. The ``headline`` and
            ``description`` keys of that analysis are rendered.

    Returns:
        A ``plotly.graph_objects.Figure`` with one marker per step, placed
        at x = step index and y = 0, and a line joining consecutive steps.

    Notes:
        Only the headline and description are shown. Other analysis fields
        (such as ``success`` or ``assessment``) are not read, so unexpected
        values in them cannot raise here.
    """
    node_x = []
    node_y = []
    edge_x = []
    edge_y = []
    node_text = []
    hover_text = []

    for i, step in enumerate(steps):
        node_x.append(i)
        node_y.append(0)

        analysis = _parse_step_analysis(step)

        description = analysis.get('description', 'No description available.')
        if description is None:
            # JSON null would otherwise crash textwrap.
            description = 'No description available.'
        description = str(description)

        step_headline = analysis.get('headline', '')
        step_headline = '' if step_headline is None else str(step_headline)

        wrapped_description = '<br>'.join(textwrap.wrap(description, width=90, max_lines=20))

        # Hover title uses the headline shortened to 50 characters; the node
        # label uses the full headline wrapped at 30 characters per line.
        short_headline = textwrap.shorten(step_headline, width=50, placeholder='')
        if short_headline == '':
            hover_suffix = ''
            label_suffix = ''
        else:
            hover_suffix = f": {short_headline}"
            label_suffix = f":<br>{'<br>'.join(textwrap.wrap(step_headline, width=30, placeholder=''))}"

        node_text.append(f"Step {i+1}{label_suffix}")
        hover_text.append(
            f"<b>Step {i+1}{hover_suffix}</b><br><br>"
            f"<b>Description:</b><br>"
            f"{wrapped_description}<br><br>"
        )

        if i > 0:
            # A None entry breaks the line so each edge is its own segment.
            edge_x.extend([i-1, i, None])
            edge_y.extend([0, 0, None])

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=node_text,
        textposition="top center",
        showlegend=False,
        hovertext=hover_text,
        hoverinfo='text',
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
        marker=dict(
            color='#3498db',
            size=30,
            line_width=2,
        ))

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=2, color='#888'),
        hoverinfo='none',
        showlegend=False,
        mode='lines')

    layout = go.Layout(
        showlegend=True,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        plot_bgcolor='white',
        paper_bgcolor='white',
        modebar=dict(
            activecolor='#1f77b4',            # color of the active tool
            orientation='h',                  # horizontal orientation
            bgcolor='rgba(255,255,255,0.8)',  # slightly transparent white
            color='#777',                     # color of inactive tools
            add=['pan2d'],
            remove=[
                'zoom2d',
                'zoomIn2d',
                'zoomOut2d',
                'resetScale2d',
                'hoverClosestCartesian',
                'hoverCompareCartesian',
                'toggleSpikelines',
                'lasso2d',
                'lasso',
                'select2d',
                'select',
            ],
        ),
        # Horizontal legend anchored just above the plot area, on the right.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            bgcolor='rgba(255,255,255,0.8)',
            bordercolor='rgba(0,0,0,0.1)',
            borderwidth=1,
        ),
        dragmode='pan',
    )

    return go.Figure(data=[edge_trace, node_trace], layout=layout)