Spaces:

agent-evals
/

leaderboard

Running

File size: 19,280 Bytes

import json
import plotly.express as px
from utils.pareto import Agent, compute_pareto_frontier
import plotly.graph_objects as go
import textwrap
import numpy as np
import pandas as pd
from scipy import stats


def create_leaderboard(df, ci_metrics=None):
    """Return a string-typed copy of ``df`` with CI text folded into metric cells.

    Every column is cast to ``str`` first. Then, for each name in
    ``ci_metrics``, the column ``"<metric> CI"`` is consulted: wherever that
    cell is not the string ``'None'``, the metric cell is rewritten as
    ``"<metric rounded to 2 decimals> (<CI text>)"``. Other cells keep their
    plain string form.

    Args:
        df: Source table. It is not modified; ``astype(str)`` produces a copy.
        ci_metrics: Optional iterable of metric column names that each have a
            companion ``"<metric> CI"`` column.

    Returns:
        The new, string-typed DataFrame.

    Raises:
        KeyError: If a listed metric or its ``" CI"`` column is missing.
        ValueError: If a metric cell that has a CI cannot be parsed as a float.
    """
    table = df.astype(str)

    for metric in ci_metrics or ():
        ci_column = metric + ' CI'
        # Rebuild the column positionally rather than writing through
        # ``df.at[label, metric]``: label-based writes hit every row sharing
        # a duplicated index label, which corrupts the output.
        table[metric] = [
            f"{round(float(value), 2)} ({ci})" if ci != 'None' else value
            for value, ci in zip(table[metric].tolist(), table[ci_column].tolist())
        ]

    return table

def create_task_success_heatmap(df, benchmark_name):
    """Build an agent-by-task heatmap of the ``Success`` column.

    Rows are agents ordered by their mean ``Success`` (highest first) and
    columns are tasks ordered by their mean ``Success`` (highest first). A
    final summary row marks, per task, whether the column sum is positive,
    i.e. whether any agent has a positive ``Success`` on it.

    Args:
        df: Long-format table with ``'Agent Name'``, ``'Task ID'`` and
            ``'Success'`` columns. ``DataFrame.pivot`` requires each
            (agent, task) pair to appear at most once.
        benchmark_name: Benchmark label; only used to override the task
            total for "SWE-bench Verified (Mini)".

    Returns:
        A ``plotly.graph_objects.Figure`` containing the heatmap.
    """
    # Orderings: best-performing agents first, most-solved tasks first.
    agent_order = (
        df.groupby('Agent Name')['Success'].mean().sort_values(ascending=False).index
    )
    task_order = (
        df.groupby('Task ID')['Success'].mean().sort_values(ascending=False).index
    )

    # Agents x tasks matrix, arranged according to the orderings above.
    matrix = df.pivot(index='Agent Name', columns='Task ID', values='Success')
    matrix = matrix.reindex(index=agent_order, columns=task_order)

    # 1 where the task column has a positive sum, else 0.
    solved_by_any = (matrix.sum(axis=0) > 0).astype(int)

    if benchmark_name == "SWE-bench Verified (Mini)":
        task_count = 50  # TODO - remove hardcoding
    else:
        task_count = len(matrix.columns)

    # Append the summary as an extra row whose label carries the totals.
    summary_row = solved_by_any.to_frame().T
    summary_row.index = [
        f'<b>Tasks Solved: {solved_by_any.sum()}/{task_count} (Any Agent)</b>'
    ]
    matrix = pd.concat([matrix, summary_row])

    # 30 px per row (summary row included) plus 50 px of extra space.
    pixels_per_row = 30
    figure_height = len(matrix.index) * pixels_per_row + 50

    heatmap = go.Heatmap(
        z=matrix.values,
        y=matrix.index,
        x=matrix.columns,
        colorscale=[[0, 'white'], [1, '#3498db']],
        showscale=False,
        hovertemplate=(
            '<b>Agent:</b> %{y}<br>'
            '<b>Task:</b> %{x}<br>'
            '<b>Status:</b> %{z}<extra></extra>'
        ),
    )
    fig = go.Figure(data=heatmap)

    y_axis = {
        'autorange': 'reversed',
        'showticklabels': True,
        'showline': True,
        'linecolor': 'black',
        'showgrid': False,
    }
    x_axis = {
        'side': 'top',
        'showticklabels': False,
        'showline': True,
        'linecolor': 'black',
        'showgrid': False,
    }
    hover_label = {'bgcolor': "white", 'font_size': 12, 'font_family': "Arial"}
    # Pan-only toolbar: zoom, select and hover-mode buttons are removed.
    mode_bar = {
        'activecolor': '#1f77b4',
        'orientation': 'h',
        'bgcolor': 'rgba(255,255,255,0.8)',
        'color': '#777',
        'add': ['pan2d'],
        'remove': [
            'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
            'hoverClosestCartesian', 'hoverCompareCartesian',
            'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select'
        ],
    }

    fig.update_layout(
        xaxis_title='Task ID',
        height=figure_height,
        yaxis=y_axis,
        xaxis=x_axis,
        plot_bgcolor='white',
        paper_bgcolor='white',
        hoverlabel=hover_label,
        modebar=mode_bar,
        dragmode='pan',
    )

    return fig

def create_bar_chart(categories, values, x_label, y_label, title):
    """Build a horizontal bar chart of categories, largest value on top.

    Each bar is annotated with its share of the sum of ``values``, phrased
    as a percentage "of failures".

    Args:
        categories: Category labels, one per bar.
        values: Numeric value per category, parallel to ``categories``.
        x_label: Accepted for interface compatibility; not applied to the
            figure by this function.
        y_label: Accepted for interface compatibility; not applied to the
            figure by this function.
        title: Accepted for interface compatibility; not applied to the
            figure by this function.

    Returns:
        A ``plotly.graph_objects.Figure`` with a single bar trace. Empty
        input yields an empty chart instead of raising.
    """
    # Stable sort: ties keep their original relative order.
    ranked = sorted(zip(categories, values), key=lambda pair: pair[1], reverse=True)
    # ``zip(*[])`` produces nothing to unpack, so handle the empty chart
    # explicitly.
    categories, values = zip(*ranked) if ranked else ((), ())

    total = sum(values)
    # When the values sum to zero, report 0% rather than dividing by zero.
    shares = [value / total if total else 0.0 for value in values]

    text_labels = [f"({share:.1%} of failures)" for share in shares]
    hover_labels = [
        f'{value} tasks ({share:.1%} of failures)'
        for value, share in zip(values, shares)
    ]

    fig = go.Figure(data=[go.Bar(
        y=categories,
        x=values,
        orientation='h',
        marker_color='#3498db',  # Same color as the scatter plot
        text=text_labels,
        textposition='auto',
        customdata=hover_labels,
        textfont=dict(color='black', size=14, family='Arial', weight=2),
        hovertemplate='<b>%{y}</b><br>' +
                      'Affected Tasks: %{customdata}<extra></extra>'
    )])

    fig.update_layout(
        height=600,
        xaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False
        ),
        yaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False,
            autorange="reversed"  # Puts the first (largest) category at the top
        ),
        plot_bgcolor='white',
        paper_bgcolor='white',
        bargap=0.2,
        bargroupgap=0.1,
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
        # Pan-only toolbar: zoom, select and hover-mode buttons are removed.
        modebar=dict(
            activecolor='#1f77b4',
            orientation='h',
            bgcolor='rgba(255,255,255,0.8)',
            color='#777',
            add=['pan2d'],
            remove=[
                'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
                'hoverClosestCartesian', 'hoverCompareCartesian',
                'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select'
            ]
        ),
        dragmode='pan'
    )

    return fig

def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
    """Plot one point per agent (mean ``x`` vs mean ``y``) with a Pareto line.

    For every distinct value of ``df[hover_data[0]]`` the rows are averaged
    into a single marker. Agents with more than one row also get min-max
    error bars on both axes. Labels are added as annotations, skipping any
    label whose point lies close to an earlier one.

    Args:
        df: Table with one row per run. Must contain ``x``, ``y``,
            ``'Agent Name'``, ``'Total Cost'``, ``'Accuracy'`` and every
            column named in ``hover_data``.
        x: Column plotted on the x axis.
        y: Column plotted on the y axis.
        x_label: X-axis title.
        y_label: Y-axis title.
        hover_data: Columns passed to the markers as ``customdata``; the
            first entry is the column used to group rows into agents.
            Defaults to ``['Agent Name']`` when omitted or empty.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    def clean_agent_name(name):
        # Turn a markdown link "[label](url)" into "label".
        if '[' in str(name):
            return name[1:].rsplit(']')[0]
        return name

    # The original default of None crashed on hover_data[0]; fall back to
    # the agent-name column.
    if not hover_data:
        hover_data = ['Agent Name']
    group_column = hover_data[0]

    # One Agent per distinct agent name, using the mean cost and accuracy of
    # its rows.
    # NOTE(review): the frontier always uses 'Total Cost' / 'Accuracy', while
    # the markers use ``x`` / ``y``; they agree only when the caller passes
    # those same columns. Confirm against callers.
    agents = []
    for name in df['Agent Name'].unique():
        runs = df[df['Agent Name'] == name]
        agents.append(Agent(runs['Total Cost'].mean(), runs['Accuracy'].mean()))

    pareto_frontier = compute_pareto_frontier(agents)

    fig = go.Figure()

    # Draw the frontier left to right.
    pareto_points = sorted(
        ((agent.total_cost, agent.accuracy) for agent in pareto_frontier),
        key=lambda point: point[0],
    )
    fig.add_trace(go.Scatter(
        x=[point[0] for point in pareto_points],
        y=[point[1] for point in pareto_points],
        mode='lines',
        name='Pareto Frontier',
        # hoverinfo=None is a no-op in Plotly; 'skip' actually disables
        # hover for this helper trace.
        hoverinfo='skip',
        line=dict(color='black', width=1, dash='dash')
    ))

    # Marker coordinates and labels, collected for the annotation pass.
    all_x = []
    all_y = []
    all_labels = []

    for agent in df[group_column].unique():
        # Explicit copy so the name clean-up below never writes into a
        # slice of ``df``.
        agent_data = df[df[group_column] == agent].copy()
        agent_data['Agent Name'] = agent_data['Agent Name'].apply(clean_agent_name)

        x_mean = np.mean(agent_data[x].values)
        y_mean = np.mean(agent_data[y].values)

        all_x.append(x_mean)
        all_y.append(y_mean)
        all_labels.append(agent_data['Agent Name'].iloc[0])

        if len(agent_data) > 1:
            # Min-max error bars, drawn on invisible markers so only the
            # bars show: x bars first, then y bars.
            error_specs = (
                ('error_x', x, x_mean, '#fec44f'),
                ('error_y', y, y_mean, '#bdbdbd'),
            )
            for error_key, column, center, color in error_specs:
                error_bar = dict(
                    type='data',
                    symmetric=False,
                    # Scalars, one per point (previously nested arrays).
                    array=[np.max(agent_data[column]) - center],
                    arrayminus=[center - np.min(agent_data[column])],
                    color=color,
                )
                fig.add_trace(go.Scatter(
                    x=[x_mean],
                    y=[y_mean],
                    mode='markers',
                    marker=dict(color='rgba(0,0,0,0)', opacity=0),
                    showlegend=False,
                    hoverinfo='skip',
                    **{error_key: error_bar}
                ))

        # The visible marker for this agent.
        fig.add_trace(go.Scatter(
            x=[x_mean],
            y=[y_mean],
            mode='markers',
            marker=dict(size=10, color='#3498db'),
            customdata=agent_data[hover_data],
            showlegend=False,
            hovertemplate="<br>".join([
                "<b>Agent</b>: %{customdata[0]}",
                "<b>Total Cost</b>: $%{x:.1f}",
                "<b>Accuracy</b>: %{y:.1%}<extra></extra>",
            ]),
            hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
        ))

    # Legend-only entries explaining the error-bar colors.
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(color='#fec44f', size=10),
        name='Cost CI (Min-Max)'
    ))
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(color='#bdbdbd', size=10),
        name='Accuracy CI (Min-Max)'
    ))

    fig.update_layout(
        height=600,
        xaxis_title=x_label,
        yaxis_title=y_label,
        xaxis=dict(
            showline=True,
            linecolor='black',
            showgrid=False
        ),
        yaxis=dict(
            showline=True,
            showgrid=False,
            linecolor='black'
        ),
        plot_bgcolor='white',
        legend=dict(
            yanchor="bottom",
            y=0.01,
            xanchor="right",
            x=0.98,
            bgcolor="rgba(255, 255, 255, 0.5)"
        ),
        # Pan-only toolbar: zoom, select and hover-mode buttons are removed.
        modebar=dict(
            activecolor='#1f77b4',
            orientation='h',
            bgcolor='rgba(255,255,255,0.8)',
            color='#777',
            add=['pan2d'],
            remove=[
                'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
                'hoverClosestCartesian', 'hoverCompareCartesian',
                'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select'
            ]
        ),
        dragmode='pan',
        showlegend=True,
        annotations=[],
    )

    # Label pass. Extents are loop-invariant, so compute them once; the
    # guard keeps min()/max() away from an empty list.
    if all_x:
        x_min, x_max = min(all_x), max(all_x)
        y_min, y_max = min(all_y), max(all_y)
        x_range = x_max - x_min
        y_range = y_max - y_min

        for i, (x_pos, y_pos, label) in enumerate(zip(all_x, all_y, all_labels)):
            # Pixel offsets of the label from the point. Positive ax is to
            # the right, positive ay is below. Default: lower right.
            ax = 20
            ay = 20

            # Leftmost 5% of the x extent: push the label far to the right.
            if x_pos < x_min + 0.05 * x_range:
                ax = 120

            # Rightmost 10% of the x extent: put the label on the left.
            if x_pos > x_max - 0.1 * x_range:
                ax = -20

            # Bottom 5% of the y extent: put the label above the point.
            if y_pos < y_min + 0.05 * y_range:
                ay = -30

            # Top 10% of the y extent: the label is also placed above.
            # NOTE(review): the original comment said "move label down",
            # but a negative ay places it above; confirm the intent.
            if y_pos > y_max - 0.1 * y_range:
                ay = -20

            # A label is dropped when its point lies within 12% of both
            # extents of any earlier point. The first point has no
            # predecessors, so it is always labelled.
            overlaps_earlier = any(
                abs(x_pos - all_x[j]) < 0.12 * x_range
                and abs(y_pos - all_y[j]) < 0.12 * y_range
                for j in range(i)
            )
            if overlaps_earlier:
                continue

            fig.add_annotation(
                x=x_pos,
                y=y_pos,
                text=label,
                showarrow=True,
                arrowhead=0,
                arrowsize=1,
                arrowwidth=1,
                arrowcolor="#CCCCCC",
                ax=ax,
                ay=ay,
                font=dict(
                    size=10
                ),
            )

    fig.update_yaxes(rangemode="tozero")
    fig.update_xaxes(rangemode="tozero")

    return fig


import plotly.graph_objects as go
import textwrap

def create_flow_chart(steps):
    """Render a sequence of steps as a horizontal chain of labelled nodes.

    Each step becomes a node at ``(i, 0)`` joined to its predecessor by a
    line. The node label is ``"Step N"`` plus the wrapped headline, and the
    hover text shows the shortened headline and the wrapped description.

    Args:
        steps: Sequence of mappings, each with an ``'analysis'`` entry that
            is either a dict or a JSON string encoding one. The keys read
            are ``'description'`` and ``'headline'``. An analysis that
            cannot be decoded, or does not decode to a dict, is treated as
            empty.

    Returns:
        A ``plotly.graph_objects.Figure``.

    Raises:
        KeyError: If a step has no ``'analysis'`` entry.
    """
    node_x = []
    node_y = []
    edge_x = []
    edge_y = []
    node_text = []
    hover_text = []

    for i, step in enumerate(steps):
        node_x.append(i)
        node_y.append(0)

        analysis = step['analysis']
        if isinstance(analysis, str):
            try:
                analysis = json.loads(analysis)
            except json.JSONDecodeError:
                analysis = {}
        # json.loads can return a list, number or string, and the raw value
        # may be None; only a dict supports the lookups below.
        if not isinstance(analysis, dict):
            analysis = {}

        description = analysis.get('description', 'No description available.')
        step_headline = analysis.get('headline', '')

        # Wrap the description so the hover box stays readable.
        wrapped_description = '<br>'.join(textwrap.wrap(description, width=90, max_lines=20))

        # One-line headline for the hover title, multi-line for the node.
        short_headline = textwrap.shorten(step_headline, width=50, placeholder='')
        if short_headline == '':
            hover_headline = ''
            node_headline = ''
        else:
            hover_headline = f": {short_headline}"
            node_headline = f":<br>{'<br>'.join(textwrap.wrap(step_headline, width=30, placeholder=''))}"

        node_text.append(f"Step {i+1}{node_headline}")
        hover_text.append(
            f"<b>Step {i+1}{hover_headline}</b><br><br>"
            f"<b>Description:</b><br>"
            f"{wrapped_description}<br><br>"
        )

        # Connect this node to the previous one; None breaks the line so
        # segments are drawn independently.
        if i > 0:
            edge_x.extend([i-1, i, None])
            edge_y.extend([0, 0, None])

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=node_text,
        textposition="top center",
        showlegend=False,
        hovertext=hover_text,
        hoverinfo='text',
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
        marker=dict(
            color='#3498db',
            size=30,
            line_width=2,
        ))

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=2, color='#888'),
        hoverinfo='none',
        showlegend=False,
        mode='lines')

    layout = go.Layout(
        showlegend=True,
        hovermode='closest',
        margin=dict(b=20,l=5,r=5,t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        plot_bgcolor='white',
        paper_bgcolor='white',
        modebar=dict(
            activecolor='#1f77b4',  # Color of active tool
            orientation='h',  # Horizontal orientation
            bgcolor='rgba(255,255,255,0.8)',  # Slightly transparent white background
            color='#777',  # Color of inactive tools
            # Pan-only toolbar: zoom, select and hover-mode buttons removed.
            add=['pan2d'],
            remove=[
                'zoom2d',
                'zoomIn2d',
                'zoomOut2d',
                'resetScale2d',
                'hoverClosestCartesian',
                'hoverCompareCartesian',
                'toggleSpikelines',
                'lasso2d',
                'lasso',
                'select2d',
                'select',
            ],
        ),
        # Legend sits just above the plot area, right-aligned.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            bgcolor='rgba(255,255,255,0.8)',  # Slightly transparent white background
            bordercolor='rgba(0,0,0,0.1)',  # Light border around the legend
            borderwidth=1
        ),
        dragmode='pan',
    )

    # Edges first so the nodes are drawn on top of them.
    fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

    return fig