Spaces: benediktstroebl committed · Commit cd69490 · Parent(s): 3e874db

format update and added monitor llm client backend

Browse files:
- agent_monitor/monitor.py +170 -0
- utils/data.py +12 -1
- utils/processing.py +25 -10
- utils/viz.py +34 -6
agent_monitor/monitor.py
ADDED
@@ -0,0 +1,170 @@
import asyncio
from openai import AsyncOpenAI
from collections import defaultdict
import weave
from pydantic import BaseModel
from abc import ABC, abstractmethod
import json

class StepAnalysis(BaseModel):
    description: str
    action_type: str
    assessment: str
    success: bool
    headline: str

class TaskSummary(BaseModel):
    overview: str
    key_successes: str
    main_challenges: str
    overall_assessment: str


def get_weave_calls(client):
    calls = client.calls()

    processed_calls = []
    for call in calls:
        ChatCompletion = weave.ref(call.output).get()
        choices = [choice.message.content for choice in ChatCompletion.choices]
        output = {
            'weave_task_id': call.attributes['weave_task_id'],
            'trace_id': call.trace_id,
            'project_id': call.project_id,
            'created_timestamp': ChatCompletion.created,
            'inputs': dict(call.inputs),
            'id': call.id,
            'outputs': {'choices': choices},
            'exception': call.exception,
            'summary': call.summary,
            'display_name': call.display_name,
            'attributes': dict(call.attributes),
            "_children": call._children,
            '_feedback': call._feedback,
        }
        processed_calls.append(output)
    return processed_calls

class AsyncLLMClient(ABC):
    @abstractmethod
    async def generate_text(self, prompt, system_message=None, response_format=None):
        pass

class AsyncOpenAIClient(AsyncLLMClient):
    def __init__(self, model="gpt-4o-mini"):
        self.model = model
        self.client = AsyncOpenAI()

    async def generate_text(self, prompt, system_message=None, response_format=None):
        messages = [
            {"role": "system", "content": system_message or "You are a helpful AI assistant."},
            {"role": "user", "content": prompt}
        ]
        response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)

        return response.choices[0].message.content

async def analyze_agent_steps(processed_calls, llm_client):
    task_calls = defaultdict(list)
    for call in processed_calls:
        task_calls[call['weave_task_id']].append(call)

    for task_id in task_calls:
        task_calls[task_id].sort(key=lambda x: x['created_timestamp'])

    tasks = [analyze_task(calls, llm_client) for task_id, calls in task_calls.items()]
    task_analyses = await asyncio.gather(*tasks)

    return dict(zip(task_calls.keys(), task_analyses))

async def analyze_task(calls, llm_client):
    step_tasks = [analyze_step(call, i+1, len(calls), llm_client) for i, call in enumerate(calls)]
    steps = await asyncio.gather(*step_tasks)

    try:
        task_analysis = await summarize_task(steps, llm_client)

        return {
            'steps': steps,
            'task_analysis': task_analysis
        }
    except Exception as e:
        print(f"Error in task summarization: {str(e)}")
        return TaskSummary(
            overview="Summary generation failed",
            key_successes="",  # str fields per the TaskSummary model, not lists
            main_challenges="",
            overall_assessment="Unable to assess due to error"
        )

async def analyze_step(call, step_number, total_steps, llm_client):
    prompt = f"""
    Analyze Step {step_number}/{total_steps} of the AI agent task:
    Input: {call['inputs']}
    Output: {call['outputs']}
    Exception: {call['exception']}
    Summary: {call['summary']}

    Provide an analysis with:
    1. A brief description of the agent's action.
    2. A classification of the action as one of: 'plan', 'tool', 'retrieve', or 'other'.
    3. A brief evaluation of progress, obstacles, or errors.
    4. Whether the agent successfully completed its intended action.
    5. A concise headline summarizing the action, ideally fewer than 7 words.

    Ensure accuracy and conciseness. Be specific and avoid overly high-level descriptions.
    """

    system_message = "You are an expert AI system analyst, skilled in categorizing and evaluating AI agent actions."
    analysis = await llm_client.generate_text(prompt, system_message, response_format=StepAnalysis)

    try:
        analysis = json.loads(analysis)
    except json.JSONDecodeError:
        print(f"Error parsing analysis for step {step_number} of {total_steps} in task {call['weave_task_id']}. Using default values.")
        analysis = StepAnalysis(
            description="Analysis failed",
            action_type='other',  # field is action_type, not category
            assessment="Unable to assess due to error",
            success=False,
            headline="Analysis failed"  # headline is required by the model
        )

    return {
        'call_data': call,
        'analysis': analysis
    }

async def summarize_task(steps, llm_client):
    steps_summary = "\n".join([f"Step {i+1}: {step['analysis']}" for i, step in enumerate(steps)])

    prompt = f"""
    Analyze the following AI agent task steps:

    {steps_summary}

    Provide a summary with:
    1. A concise overview of the agent's approach.
    2. Main achievements or breakthroughs.
    3. Primary obstacles or errors encountered.
    4. A brief evaluation of the agent's overall performance.

    Focus on patterns in the agent's approach and effectiveness. Be concise and insightful.
    """

    system_message = "You are an expert AI performance analyst, skilled in evaluating and summarizing AI agent task execution."
    analysis = await llm_client.generate_text(prompt, system_message, response_format=TaskSummary)
    return json.loads(analysis)

# async def main():
#     client = weave.init("citp_agent_eval/usaco_1723148990")
#     processed_calls = get_weave_calls(client)

#     weave.finish()
#     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
#     task_analyses_openai = await analyze_agent_steps(processed_calls, openai_client)

#     with open("task_analyses.json", "w") as f:
#         json.dump(task_analyses_openai, f, indent=4)

# if __name__ == "__main__":
#     asyncio.run(main())
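Because AsyncLLMClient is an abstract interface, the monitor backend is swappable. As a hedged sketch (not part of this commit), a minimal stub client exercises the pipeline offline; StubLLMClient, its canned JSON payloads, and the fake calls below are all hypothetical:

import asyncio
import json

from agent_monitor.monitor import AsyncLLMClient, StepAnalysis, analyze_agent_steps

class StubLLMClient(AsyncLLMClient):
    # Offline stand-in: returns canned JSON matching the requested response_format.
    async def generate_text(self, prompt, system_message=None, response_format=None):
        if response_format is StepAnalysis:
            return json.dumps({"description": "stub", "action_type": "other",
                               "assessment": "stub", "success": True,
                               "headline": "stub step"})
        # Otherwise assume a TaskSummary-shaped answer is expected.
        return json.dumps({"overview": "stub", "key_successes": "stub",
                           "main_challenges": "stub", "overall_assessment": "stub"})

# Two fake calls in one task (fields mirror what get_weave_calls produces).
calls = [{"weave_task_id": "t1", "created_timestamp": i, "inputs": {},
          "outputs": {}, "exception": None, "summary": {}} for i in range(2)]
print(asyncio.run(analyze_agent_steps(calls, StubLLMClient())))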
utils/data.py
CHANGED
@@ -47,7 +47,18 @@ def parse_json_files(folder_path, benchmark_name):
 
     # round all float columns to 2 decimal places
     for column in df.select_dtypes(include='float').columns:
-        df[column] = df[column].round(2)
+        df[column] = df[column].round(3)
+
+    # Rename columns
+    df = df.rename(columns={
+        'agent_name': 'Agent Name',
+        'results_total_cost': 'Total Cost',
+        'results_accuracy': 'Accuracy',
+        'results_precision': 'Precision',
+        'results_recall': 'Recall',
+        'results_f1_score': 'F1 Score',
+        'results_auc': 'AUC',
+    })
 
     return df
 
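For context, a minimal sketch of what this hunk does to one results row (the values are invented):

import pandas as pd

# Hypothetical raw row, before the rename in parse_json_files.
df = pd.DataFrame([{'agent_name': 'example-agent',
                    'results_total_cost': 1.23456,
                    'results_accuracy': 0.87654}])

for column in df.select_dtypes(include='float').columns:
    df[column] = df[column].round(3)

df = df.rename(columns={'agent_name': 'Agent Name',
                        'results_total_cost': 'Total Cost',
                        'results_accuracy': 'Accuracy'})
print(df)  # columns: Agent Name, Total Cost (1.235), Accuracy (0.877)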
utils/processing.py
CHANGED
@@ -7,6 +7,7 @@ from email.mime.multipart import MIMEMultipart
 import asyncio
 import aiofiles
 import aiosmtplib
+from agent_monitor.monitor import analyze_agent_steps, AsyncOpenAIClient
 
 async def check_and_process_uploads():
     upload_dir = "evals_upload"
@@ -36,8 +37,9 @@ async def check_and_process_uploads():
             with open(processed_path, 'r') as f:
                 processed_data = json.load(f)
 
-            if new_data != processed_data:
-                unprocessed_uploads.append(upload)
+            # TODO we can use a better comparison method with exact comparison
+            # if new_data != processed_data:
+            #     unprocessed_uploads.append(upload)
             print(f"Upload {upload} is already in processed directory.")
         elif os.path.exists(live_path):
             with open(upload_path, 'r') as f:
@@ -46,8 +48,8 @@ async def check_and_process_uploads():
             with open(live_path, 'r') as f:
                 live_data = json.load(f)
 
-            if new_data != live_data:
-                unprocessed_uploads.append(upload)
+            # if new_data != live_data:
+            #     unprocessed_uploads.append(upload)
             print(f"Upload {upload} is already in live directory.")
         else:
             unprocessed_uploads.append(upload)
@@ -71,12 +73,12 @@ async def process_single_upload(upload_path, processed_path):
         await process_upload(upload_path, processed_path)
 
         # Move the file to processed directory
-        await asyncio.to_thread(shutil.move, upload_path, processed_path)
+        # await asyncio.to_thread(shutil.move, upload_path, processed_path)
 
         # Send email notification
         # await send_email_notification(upload_path.name, check_result, "Processing successful")
     else:
-
+        print(f"Upload check failed for {upload_path}: {check_result['message']}")
         # Send email notification about the failed check
         # await send_email_notification(upload_path.name, check_result, "Upload check failed")
 
@@ -110,10 +112,23 @@ async def check_upload_structure(file_path):
 
 
 async def process_upload(input_path, output_path):
-
-    #
-
-
+    print(f"Processing {input_path}...")
+    # load the file
+    with open(input_path, 'r') as f:
+        data = json.loads(f.read())
+
+    assert 'raw_logging_results' in data, "raw_logging_results key not found in the file"
+    openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
+
+    processed_calls = await analyze_agent_steps(data['raw_logging_results'][:2], openai_client)
+
+    # Save the processed data
+    data['raw_logging_results'] = processed_calls
+
+    with open(output_path, 'w') as f:
+        json.dump(data, f, indent=4)
+
+    print(f"Processing of {input_path} successful. Results saved to {output_path}")
 
 async def send_email_notification(filename, check_result, status):
     sender_email = "[email protected]"
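A hedged usage sketch for the new process_upload coroutine (the paths are hypothetical); note the commit analyzes only the first two raw logging entries (data['raw_logging_results'][:2]), presumably a debugging limit:

import asyncio

from utils.processing import process_upload

# Hypothetical upload file; the JSON must contain a 'raw_logging_results' key,
# and OPENAI_API_KEY must be set for AsyncOpenAIClient to work.
asyncio.run(process_upload("evals_upload/example.json",
                           "evals_processed/example.json"))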
utils/viz.py
CHANGED
@@ -5,15 +5,23 @@ import plotly.graph_objects as go
 import textwrap
 
 def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
-    agents = [Agent(row['results_total_cost'], row['results_accuracy']) for i, row in df.iterrows()]
+    agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
     pareto_frontier = compute_pareto_frontier(agents)
 
-
     fig = px.scatter(df,
                      x=x,
                      y=y,
-
-
+                     custom_data=hover_data)
+    fig.update_traces(
+        hovertemplate="<br>".join([
+            "<b>Agent</b>: %{customdata[0]}",
+            "<b>Total Cost</b>: $%{x:.1f}",
+            "<b>Accuracy</b>: %{y:.1%}",
+        ])
+    )
+
+    fig.update_traces(marker=dict(size=10, color='#1b9e77'),
+                      hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
 
 
     # Sort the Pareto frontier points by x-coordinate
@@ -32,7 +40,7 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
     fig.update_xaxes(rangemode="tozero")
 
     fig.update_layout(
-        width =
+        width = 1400,
         height = 500,
         xaxis_title = x_label,
         yaxis_title = y_label,
@@ -52,7 +60,27 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
             xanchor="right",
             x=0.98,
             bgcolor="rgba(255, 255, 255, 0.5)" # semi-transparent white background
-        )
+        ),
+        modebar=dict(
+            activecolor='#1f77b4', # Color of active tool
+            orientation='h', # Horizontal orientation
+            bgcolor='rgba(255,255,255,0.8)', # Slightly transparent white background
+            color='#777', # Color of inactive tools
+            add = ['pan2d'],
+            remove = [
+                'zoom2d',
+                'zoomIn2d',
+                'zoomOut2d',
+                'resetScale2d',
+                'hoverClosestCartesian',
+                'hoverCompareCartesian',
+                'toggleSpikelines',
+                'lasso2d',
+                'lasso',
+                'select2d',
+                'select']
+        ),
+        dragmode='pan'
     )
     return fig
 
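Finally, a minimal sketch of calling the updated create_scatter_plot, assuming a frame with the columns renamed in utils/data.py (the sample values are invented):

import pandas as pd

from utils.viz import create_scatter_plot

df = pd.DataFrame({'Agent Name': ['A', 'B'],
                   'Total Cost': [1.2, 3.4],
                   'Accuracy': [0.61, 0.74]})

# hover_data feeds px.scatter's custom_data, so %{customdata[0]}
# in the hovertemplate resolves to the 'Agent Name' column.
fig = create_scatter_plot(df, x='Total Cost', y='Accuracy',
                          x_label='Total Cost (USD)', y_label='Accuracy',
                          hover_data=['Agent Name'])
fig.show()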