"""Leaderboard table formatting and Plotly figure builders."""

import json
import textwrap

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

from utils.pareto import Agent, compute_pareto_frontier

# Plotly renders "<br>" as a line break in hover text and labels; a plain
# newline character is not rendered as a break.
_BR = '<br>'

_BAR_COLOR = '#3498db'


def _pan_only_modebar():
    """Return the modebar settings shared by every figure in this module.

    A fresh dict is returned on each call so callers never share state.
    """
    return dict(
        activecolor='#1f77b4',
        orientation='h',
        bgcolor='rgba(255,255,255,0.8)',
        color='#777',
        add=['pan2d'],
        remove=[
            'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
            'hoverClosestCartesian', 'hoverCompareCartesian',
            'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select',
        ],
    )


def _hover_label():
    """Return the hover-label styling shared by every figure in this module."""
    return dict(bgcolor="white", font_size=12, font_family="Arial")


def create_leaderboard(df, ci_metrics=None):
    """Return a string-typed copy of *df* with CI values merged into metrics.

    Args:
        df: DataFrame holding one column per metric and, for every metric
            listed in ``ci_metrics``, a companion ``"<metric> CI"`` column.
        ci_metrics: Optional iterable of metric column names. For each one,
            rows whose CI cell is not the string ``'None'`` get their metric
            cell rewritten as ``"<metric rounded to 2 dp> (<CI>)"``.

    Returns:
        A new DataFrame in which every cell is a string.

    Raises:
        KeyError: if a metric or its ``" CI"`` column is missing.
        ValueError: if a metric cell that has a CI cannot be parsed as float.
    """
    df = df.astype(str)

    for metric in ci_metrics or ():
        ci_column = metric + ' CI'
        # A boolean mask (instead of iterrows + df.at) stays correct even when
        # the index contains duplicate labels.
        has_ci = df[ci_column] != 'None'
        if not has_ci.any():
            continue
        rounded = df.loc[has_ci, metric].map(lambda cell: str(round(float(cell), 2)))
        df.loc[has_ci, metric] = rounded + " (" + df.loc[has_ci, ci_column] + ")"

    return df


def create_task_success_heatmap(df, benchmark_name):
    """Build an agents-by-tasks heatmap of task success.

    Rows are agents ordered by mean ``Success`` (best first), columns are
    tasks ordered by mean ``Success`` (most-solved first). A final summary row
    marks every task solved by at least one agent.

    Args:
        df: DataFrame with ``'Agent Name'``, ``'Task ID'`` and ``'Success'``
            columns. ``DataFrame.pivot`` requires each (agent, task) pair to
            appear at most once.
        benchmark_name: Benchmark display name; only used to override the
            task total shown in the summary row for one benchmark.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    agent_accuracy = df.groupby('Agent Name')['Success'].mean().sort_values(ascending=False)
    task_success_rate = df.groupby('Task ID')['Success'].mean().sort_values(ascending=False)

    # Matrix of agents (rows) vs tasks (columns), sorted by the rankings above.
    pivot_df = df.pivot(index='Agent Name', columns='Task ID', values='Success')
    pivot_df = pivot_df.reindex(index=agent_accuracy.index, columns=task_success_rate.index)

    # 1 for every task that at least one agent solved, else 0.
    tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)

    total_tasks = len(pivot_df.columns)
    if benchmark_name == "SWE-bench Verified (Mini)":
        total_tasks = 50  # TODO - remove hardcoding

    # Append the summary row; its label carries the solved/total count.
    tasks_solved_df = pd.DataFrame(tasks_solved).T
    tasks_solved_df.index = [f'Tasks Solved: {tasks_solved.sum()}/{total_tasks} (Any Agent)']
    pivot_df = pd.concat([pivot_df, tasks_solved_df])

    row_height = 30  # Fixed height for each row in pixels
    total_height = len(pivot_df.index) * row_height

    fig = go.Figure(data=go.Heatmap(
        z=pivot_df.values,
        y=pivot_df.index,
        x=pivot_df.columns,
        colorscale=[[0, 'white'], [1, _BAR_COLOR]],
        showscale=False,
        hovertemplate=_BR.join([
            'Agent: %{y}',
            'Task: %{x}',
            'Status: %{z}',
        ]),
    ))

    fig.update_layout(
        xaxis_title='Task ID',
        height=total_height + 50,  # Extra space on top of the per-row height
        yaxis=dict(
            autorange='reversed',
            showticklabels=True,
            showline=True,
            linecolor='black',
            showgrid=False,
        ),
        xaxis=dict(
            side='top',
            showticklabels=False,
            showline=True,
            linecolor='black',
            showgrid=False,
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        hoverlabel=_hover_label(),
        modebar=_pan_only_modebar(),
        dragmode='pan',
    )

    return fig


def create_bar_chart(categories, values, x_label, y_label, title):
    """Build a horizontal bar chart of counts with share-of-total labels.

    Bars are sorted by value, largest at the top. Each bar is labelled with
    its share of ``sum(values)``.

    Args:
        categories: Iterable of category labels.
        values: Iterable of numeric counts, parallel to ``categories``.
        x_label: Accepted for interface compatibility; not used.
        y_label: Accepted for interface compatibility; not used.
        title: Accepted for interface compatibility; not used.

    Returns:
        A ``plotly.graph_objects.Figure``. Empty input yields an empty chart.
    """
    ranked = sorted(zip(categories, values), key=lambda pair: pair[1], reverse=True)
    # zip(*[]) cannot be unpacked into two names, so handle empty input here.
    categories, values = zip(*ranked) if ranked else ((), ())

    total_tasks = sum(values)

    def share(value):
        # Avoid ZeroDivisionError when every count is zero.
        return value / total_tasks if total_tasks else 0.0

    text_labels = [f"({share(value):.1%} of failures)" for value in values]
    hover_labels = [f'{value} tasks ({share(value):.1%} of failures)' for value in values]

    fig = go.Figure(data=[go.Bar(
        y=categories,
        x=values,
        orientation='h',
        marker_color=_BAR_COLOR,
        text=text_labels,
        textposition='auto',
        customdata=hover_labels,
        textfont=dict(color='black', size=14, family='Arial', weight=2),
        hovertemplate=_BR.join([
            '%{y}',
            'Affected Tasks: %{customdata}',
        ]),
    )])

    fig.update_layout(
        height=600,
        xaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False,
        ),
        yaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False,
            autorange="reversed",  # Puts the highest-value category at the top
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        bargap=0.2,
        bargroupgap=0.1,
        hoverlabel=_hover_label(),
        modebar=_pan_only_modebar(),
        dragmode='pan',
    )

    return fig


def _strip_markdown_link(name):
    """Return the label of a ``[label](url)``-style name, else *name* as is."""
    text = str(name)
    if text.startswith('[') and ']' in text:
        return text[1:].split(']', 1)[0]
    return name


def _annotate_points(fig, xs, ys, labels):
    """Attach a text label to each point that is not crowded by an earlier one.

    A point is skipped when any earlier point lies within 12% of the data
    range on both axes. Label offsets (in pixels) are adjusted for points
    close to the edges of the data range.
    """
    if not xs:
        return

    x_min, x_max = min(xs), max(xs)
    y_min, y_max = min(ys), max(ys)
    x_range = x_max - x_min
    y_range = y_max - y_min

    for i, (x_pos, y_pos, label) in enumerate(zip(xs, ys, labels)):
        crowded = any(
            abs(x_pos - xs[j]) < 0.12 * x_range and abs(y_pos - ys[j]) < 0.12 * y_range
            for j in range(i)
        )
        if crowded:
            continue

        # Default pixel offsets of the label relative to the point.
        ax = 20
        ay = 20
        if x_pos < x_min + 0.05 * x_range:
            ax = 120  # Large shift for points very close to the left edge
        if x_pos > x_max - 0.1 * x_range:
            ax = -20  # Near the right edge: put the label on the left
        if y_pos < y_min + 0.05 * y_range:
            ay = -30  # Near the bottom edge
        if y_pos > y_max - 0.1 * y_range:
            ay = -20  # Near the top edge

        fig.add_annotation(
            x=x_pos,
            y=y_pos,
            text=label,
            showarrow=True,
            arrowhead=0,
            arrowsize=1,
            arrowwidth=1,
            arrowcolor="#CCCCCC",
            ax=ax,
            ay=ay,
            font=dict(size=10),
        )


def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
    """Build a cost/accuracy scatter plot with a Pareto frontier line.

    One marker is drawn per distinct value of ``hover_data[0]``, positioned at
    the mean of its ``x`` and ``y`` values. Groups with more than one row also
    get min-max error bars on both axes.

    Args:
        df: DataFrame with ``'Agent Name'``, ``'Total Cost'`` and
            ``'Accuracy'`` columns plus the ``x``, ``y`` and ``hover_data``
            columns.
        x: Column plotted on the x axis.
        y: Column plotted on the y axis.
        x_label: Optional x-axis title.
        y_label: Optional y-axis title.
        hover_data: Columns passed to the markers as ``customdata``; the first
            one is also the grouping column. Defaults to ``['Agent Name']``.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    if not hover_data:
        # The original default of None crashed on hover_data[0].
        hover_data = ['Agent Name']
    group_column = hover_data[0]

    # One Agent per distinct agent name, using its mean cost and accuracy.
    mean_scores = df.groupby('Agent Name', sort=False)[['Total Cost', 'Accuracy']].mean()
    agents = [
        Agent(cost, accuracy)
        for cost, accuracy in zip(mean_scores['Total Cost'], mean_scores['Accuracy'])
    ]
    pareto_frontier = compute_pareto_frontier(agents)

    fig = go.Figure()

    # The frontier is drawn left to right, so order its points by cost.
    pareto_points = sorted(
        ((agent.total_cost, agent.accuracy) for agent in pareto_frontier),
        key=lambda point: point[0],
    )
    # NOTE(review): hoverinfo=None appears to leave Plotly's default hover
    # behaviour in place; use 'skip' if hover should be suppressed - confirm.
    fig.add_trace(go.Scatter(
        x=[point[0] for point in pareto_points],
        y=[point[1] for point in pareto_points],
        mode='lines',
        name='Pareto Frontier',
        hoverinfo=None,
        line=dict(color='black', width=1, dash='dash'),
    ))

    # Coordinates and labels collected for the annotation pass below.
    all_x = []
    all_y = []
    all_labels = []

    for agent in df[group_column].unique():
        # Copy so the name clean-up never writes into a slice of df.
        agent_data = df[df[group_column] == agent].copy()
        # Remove the url from the tooltip name.
        agent_data['Agent Name'] = agent_data['Agent Name'].apply(_strip_markdown_link)

        x_mean = np.mean(agent_data[x].values)
        y_mean = np.mean(agent_data[y].values)

        all_x.append(x_mean)
        all_y.append(y_mean)
        all_labels.append(agent_data['Agent Name'].iloc[0])

        if len(agent_data) > 1:
            # Invisible markers that only carry the min-max error bars.
            hidden_marker = dict(color='rgba(0,0,0,0)', opacity=0)
            fig.add_trace(go.Scatter(
                x=[x_mean],
                y=[y_mean],
                error_x=dict(
                    type='data',
                    symmetric=False,
                    array=[np.max(agent_data[x]) - x_mean],
                    arrayminus=[x_mean - np.min(agent_data[x])],
                    color='#fec44f',
                ),
                mode='markers',
                marker=hidden_marker,
                showlegend=False,
                hoverinfo=None,
            ))
            fig.add_trace(go.Scatter(
                x=[x_mean],
                y=[y_mean],
                error_y=dict(
                    type='data',
                    symmetric=False,
                    array=[np.max(agent_data[y]) - y_mean],
                    arrayminus=[y_mean - np.min(agent_data[y])],
                    color='#bdbdbd',
                ),
                mode='markers',
                marker=hidden_marker,
                showlegend=False,
                hoverinfo=None,
            ))

        # The visible marker for this agent.
        fig.add_trace(go.Scatter(
            x=[x_mean],
            y=[y_mean],
            mode='markers',
            marker=dict(size=10, color=_BAR_COLOR),
            customdata=agent_data[hover_data],
            showlegend=False,
            hovertemplate=_BR.join([
                "Agent: %{customdata[0]}",
                "Total Cost: $%{x:.1f}",
                "Accuracy: %{y:.1%}",
            ]),
            hoverlabel=_hover_label(),
        ))

    # Point-less traces that exist only to create legend entries.
    fig.add_trace(go.Scatter(
        x=[None],
        y=[None],
        mode='markers',
        marker=dict(color='#fec44f', size=10),
        name='Cost CI (Min-Max)',
    ))
    fig.add_trace(go.Scatter(
        x=[None],
        y=[None],
        mode='markers',
        marker=dict(color='#bdbdbd', size=10),
        name='Accuracy CI (Min-Max)',
    ))

    fig.update_layout(
        height=600,
        xaxis_title=x_label,
        yaxis_title=y_label,
        xaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False,
        ),
        yaxis=dict(
            showline=True,
            showgrid=False,
            linecolor='black',
        ),
        plot_bgcolor='white',
        legend=dict(
            yanchor="bottom",
            y=0.01,
            xanchor="right",
            x=0.98,
            bgcolor="rgba(255, 255, 255, 0.5)",
        ),
        modebar=_pan_only_modebar(),
        dragmode='pan',
        showlegend=True,
        annotations=[],
    )

    _annotate_points(fig, all_x, all_y, all_labels)

    fig.update_yaxes(rangemode="tozero")
    fig.update_xaxes(rangemode="tozero")

    return fig


def _as_text(value, default):
    """Return *value* as a string, or *default* when it is None."""
    return default if value is None else str(value)


def create_flow_chart(steps):
    """Build a left-to-right chain of nodes, one per step.

    Args:
        steps: Sequence of mappings, each with an ``'analysis'`` entry that is
            either a dict or a JSON string encoding one. The ``'description'``
            and ``'headline'`` keys of the analysis are shown; anything that
            does not decode to a dict is treated as an empty analysis.

    Returns:
        A ``plotly.graph_objects.Figure``.

    Raises:
        KeyError: if a step has no ``'analysis'`` entry.
    """
    node_x = []
    node_y = []
    edge_x = []
    edge_y = []
    node_text = []
    hover_text = []

    for i, step in enumerate(steps):
        node_x.append(i)
        node_y.append(0)

        analysis = step['analysis']
        if isinstance(analysis, str):
            try:
                analysis = json.loads(analysis)
            except json.JSONDecodeError:
                analysis = {}
        if not isinstance(analysis, dict):
            # None, or JSON that decoded to a list/scalar: nothing to show.
            analysis = {}

        description = _as_text(analysis.get('description'), 'No description available.')
        step_headline = _as_text(analysis.get('headline'), '')

        # Wrap text to improve readability inside the hover box.
        wrapped_description = _BR.join(textwrap.wrap(description, width=90, max_lines=20))
        short_headline = textwrap.shorten(step_headline, width=50, placeholder='')

        step_title = f"Step {i + 1}"
        if short_headline:
            headline_lines = _BR.join(textwrap.wrap(step_headline, width=30))
            node_text.append(f"{step_title}:{_BR}{headline_lines}")
            hover_title = f"{step_title}: {short_headline}"
        else:
            node_text.append(step_title)
            hover_title = step_title

        hover_text.append(
            f"{hover_title}{_BR}{_BR}"
            f"Description:{_BR}"
            f"{wrapped_description}{_BR}{_BR}"
        )

        if i > 0:
            # A None entry ends the segment so edges are drawn separately.
            edge_x.extend([i - 1, i, None])
            edge_y.extend([0, 0, None])

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers+text',
        text=node_text,
        textposition="top center",
        showlegend=False,
        hovertext=hover_text,
        hoverinfo='text',
        hoverlabel=_hover_label(),
        marker=dict(
            color=_BAR_COLOR,
            size=30,
            line_width=2,
        ),
    )

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=2, color='#888'),
        hoverinfo='none',
        showlegend=False,
        mode='lines',
    )

    layout = go.Layout(
        showlegend=True,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        plot_bgcolor='white',
        paper_bgcolor='white',
        modebar=_pan_only_modebar(),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            bgcolor='rgba(255,255,255,0.8)',  # Slightly transparent white
            bordercolor='rgba(0,0,0,0.1)',  # Light border around the legend
            borderwidth=1,
        ),
        dragmode='pan',
    )

    return go.Figure(data=[edge_trace, node_trace], layout=layout)