###########################################################################################################
# Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/summarise_results.py
###########################################################################################################
import glob
import json
import os
import re
import sys


def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """
    Calculate the cost (in USD) of a single model call.
    """
    if 'claude-3-5-sonnet' in model.lower():
        # https://www.anthropic.com/pricing#anthropic-api, accessed 12/11/2024
        return 0.000003 * prompt_tokens + 0.000015 * completion_tokens
    elif 'gpt-4o' in model.lower():
        # https://openai.com/api/pricing/, accessed 12/11/2024
        return 0.0000025 * prompt_tokens + 0.00001 * completion_tokens
    elif 'gemini-1.5-pro' in model.lower():
        # https://ai.google.dev/pricing#1_5pro, accessed 12/11/2024
        # assuming prompts up to 128k tokens
        cost = 0.00000125 * prompt_tokens + 0.000005 * completion_tokens
        if prompt_tokens > 128000:
            cost *= 2
        return cost
    elif 'gemini-2.0-flash-exp' in model.lower():
        # price unknown for gemini-2.0-flash-exp, assuming the same price as gemini-1.5-flash
        cost = 0.000000075 * prompt_tokens + 0.0000003 * completion_tokens
        if prompt_tokens > 128000:
            cost *= 2
        return cost
    elif 'qwen2-72b' in model.lower():
        # assuming hosted on Together
        # https://www.together.ai/pricing, accessed 12/11/2024
        return 0.0000009 * (prompt_tokens + completion_tokens)
    elif 'qwen2p5-72b' in model.lower():
        # assuming hosted on Together
        # https://www.together.ai/pricing, accessed 12/14/2024
        return 0.0000012 * (prompt_tokens + completion_tokens)
    elif 'llama-v3p1-405b-instruct' in model.lower():
        # assuming hosted on Fireworks AI
        # https://fireworks.ai/pricing, accessed 12/11/2024
        return 0.000003 * (prompt_tokens + completion_tokens)
    elif 'llama-v3p1-70b-instruct' in model.lower():
        # assuming hosted on Fireworks AI
        return 0.0000009 * (prompt_tokens + completion_tokens)
    elif 'llama-v3p3-70b-instruct' in model.lower():
        # assuming hosted on Fireworks AI
        return 0.0000009 * (prompt_tokens + completion_tokens)
    elif 'amazon.nova-pro-v1:0' in model.lower():
        # assuming hosted on Amazon Bedrock
        # https://aws.amazon.com/bedrock/pricing/, accessed 12/11/2024
        return 0.0000008 * prompt_tokens + 0.0000032 * completion_tokens
    else:
        raise ValueError(f'Unknown model: {model}')
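
# Illustrative sanity check (using the per-token rates quoted above): a call on a
# claude-3-5-sonnet model with 10,000 prompt tokens and 1,000 completion tokens
# costs 10000 * 0.000003 + 1000 * 0.000015 = 0.045 USD.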


def analyze_eval_json_file(filepath: str) -> tuple[int, int]:
    """
    Analyze a single eval JSON file and extract the total and result from final_score.

    Args:
        filepath: Path to the JSON file

    Returns:
        Tuple containing (total, result) from final_score
    """
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)
        final_score = data.get('final_score', {})
        return (final_score.get('total', 0), final_score.get('result', 0))
    except json.JSONDecodeError as e:
        print(f'Error decoding JSON in {filepath}: {e}')
        return (0, 0)
    except Exception as e:
        print(f'Error processing {filepath}: {e}')
        return (0, 0)
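
# Expected shape of an eval_*.json file, inferred from the accessors above
# (illustrative sketch; other fields may be present and are ignored):
# {
#     "final_score": {"total": 5, "result": 3}
# }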


def analyze_traj_json_file(filepath: str) -> tuple[int, float]:
    """
    Analyze a single trajectory JSON file, count the steps, and extract the token
    usage of each step. Then estimate the cost based on the tokens and the model type.

    Note: this assumes there is no prompt caching at all.
    """
    steps: int = 0
    cost: float = 0.0
    with open(filepath, 'r') as f:
        data = json.load(f)

    response_id = None
    for action in data:
        if 'tool_call_metadata' in action:
            if action['tool_call_metadata']['model_response']['id'] != response_id:
                response_id = action['tool_call_metadata']['model_response']['id']
            else:
                # OpenHands repeats the same model response metadata when a single
                # LLM call leads to multiple actions and observations, so count
                # each response only once.
                continue

            steps += 1
            usage = action['tool_call_metadata']['model_response']['usage']
            model: str = action['tool_call_metadata']['model_response']['model']
            prompt_tokens = usage['prompt_tokens']
            completion_tokens = usage['completion_tokens']
            cost += calculate_cost(model, prompt_tokens, completion_tokens)
    return (steps, cost)
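
# Each entry in a traj_*.json file is expected to look roughly like the sketch below
# (inferred from the keys accessed above; values are placeholders, unrelated fields omitted):
# {
#     "tool_call_metadata": {
#         "model_response": {
#             "id": "<response id>",
#             "model": "<model name, e.g. one containing 'claude-3-5-sonnet'>",
#             "usage": {"prompt_tokens": 1234, "completion_tokens": 56}
#         }
#     }
# }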


def analyze_folder(
    folder_path: str,
) -> tuple[dict[str, tuple[int, int]], dict[str, tuple[int, float]]]:
    """
    Analyze all eval_*.json & traj_*.json files in the specified folder.

    Args:
        folder_path: Path to the folder containing JSON files

    Returns:
        Two dictionaries:
            - eval_results: task name (extracted from the filename) -> (total, result)
            - traj_results: task name (extracted from the filename) -> (steps, cost)
    """
    eval_results = {}
    traj_results = {}
    eval_pattern = os.path.join(folder_path, 'eval_*.json')
    traj_pattern = os.path.join(folder_path, 'traj_*.json')
    for filepath in glob.glob(eval_pattern):
        filename = os.path.basename(filepath)
        total, result = analyze_eval_json_file(filepath)
        key = re.search(r'eval_(.+)\.json', filename).group(1)
        eval_results[key] = (total, result)
    for filepath in glob.glob(traj_pattern):
        filename = os.path.basename(filepath)
        steps, cost = analyze_traj_json_file(filepath)
        key = re.search(r'traj_(.+)\.json', filename).group(1)
        traj_results[key] = (steps, cost)
    return eval_results, traj_results
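
# For example (illustrative file names and values): a folder containing
# eval_sde-foo.json and traj_sde-foo.json would yield something like
# ({'sde-foo': (5, 3)}, {'sde-foo': (42, 1.27)}).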


def get_task_nature_category(task_name: str) -> str:
    """
    Get the nature category of the task, based on the prefix of its name.
    """
    task_nature = task_name.split('-')[0]
    if task_nature.lower() in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance']:
        # Normalise to lowercase so the per-category summary in main() matches
        # regardless of the casing used in the task name.
        return task_nature.lower()
    else:
        return 'other'
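
# Illustrative examples (hypothetical task names):
#   get_task_nature_category('sde-fix-factual-mistake') -> 'sde'
#   get_task_nature_category('research-write-survey')   -> 'other'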


def calculate_score(total: int, result: int) -> float:
    """
    Calculate the score as a number between 0 and 1.

    Formula: score = (result / total) * 0.5 + (result // total) * 0.5

    Explanation:
        - (result / total) * 0.5: the completion ratio, scaled down to the 0-0.5 range.
        - (result // total) * 0.5: a binary bonus that is 0.5 only when the task was
          fully completed (result == total) and 0 otherwise.

    Args:
        total: Total possible points
        result: Actual points achieved

    Returns:
        Score as a number between 0 and 1
    """
    return (result / total * 0.5) + (result // total * 0.5)
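
# Worked examples of the formula above:
#   total=4, result=2 -> 2/4 * 0.5 + 2//4 * 0.5 = 0.25
#   total=4, result=4 -> 4/4 * 0.5 + 4//4 * 0.5 = 1.00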


def is_perfect_completion(total: int, result: int) -> bool:
    """
    Check if the task achieved perfect completion.

    Args:
        total: Total possible points
        result: Actual points achieved

    Returns:
        True if result equals total and total is positive, False otherwise
    """
    return total > 0 and total == result
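
# Worked examples: is_perfect_completion(4, 4) -> True; is_perfect_completion(4, 3) -> False;
# is_perfect_completion(0, 0) -> False (an empty checklist never counts as perfect).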


def main():
    if len(sys.argv) != 2:
        print('Usage: poetry run python summarise_results.py <folder_path>')
        sys.exit(1)

    folder_path = sys.argv[1]
    if not os.path.isdir(folder_path):
        print(f"Error: '{folder_path}' is not a valid directory")
        sys.exit(1)

    eval_results, traj_results = analyze_folder(folder_path)
    if not eval_results:
        print(f'No eval_*.json files found in {folder_path}')
        return

    # Create a list of results with scores and categories for sorting
    detailed_results = [
        (
            task_name,
            total,
            result,
            calculate_score(total, result),
            is_perfect_completion(total, result),
            get_task_nature_category(task_name),
        )
        for task_name, (total, result) in eval_results.items()
    ]

    # Sort by score in descending order, then by task name
    detailed_results.sort(key=lambda x: (-x[3], x[0]))

    # Calculate perfect completion stats
    perfect_completions = sum(
        1 for _, _, _, _, is_perfect, _ in detailed_results if is_perfect
    )

    # Print header
    print('\n# Evaluation Results Report')
    print('\n## Results per File')
    print('\n*Sorted by score (⭐ indicates perfect completion)*\n')

    # Print table header
    print(
        '| Filename | Total | Result | Score | Steps | Cost (assuming no prompt caching) |'
    )
    print('|----------|--------|---------|-------|-------|------|')

    # Print individual file results
    for task_name, total, result, score, is_perfect, task_nature in detailed_results:
        perfect_marker = ' ⭐' if is_perfect else ''
        print(
            f'| {task_name} | {total:,} | {result:,} | {score:.2f}{perfect_marker} | {traj_results[task_name][0]} | {traj_results[task_name][1]:.2f} |'
        )

    # Print summary section
    print('\n## Summary\n')
    print(f'**Tasks Evaluated:** {len(eval_results)}\n')
    print(
        f'**Perfect Completions:** {perfect_completions}/{len(eval_results)} ({(perfect_completions / len(eval_results) * 100):.2f}%)\n'
    )
    overall_score = (
        sum(score for _, _, _, score, _, _ in detailed_results)
        / len(detailed_results)
        * 100
    )
    avg_steps = sum(steps for steps, _ in traj_results.values()) / len(traj_results)
    avg_cost = sum(cost for _, cost in traj_results.values()) / len(traj_results)
    print(f'**Overall Score:** {overall_score:.2f}%\n')
    print(f'**Average Steps:** {avg_steps:.2f}\n')
    print(f'**Average Cost (USD):** {avg_cost:.2f}\n')

    # Additional statistics
    if detailed_results:
        highest_score = max(score for _, _, _, score, _, _ in detailed_results)
        lowest_score = min(score for _, _, _, score, _, _ in detailed_results)
        median_score = detailed_results[len(detailed_results) // 2][3]
        avg_score = sum(score for _, _, _, score, _, _ in detailed_results) / len(
            detailed_results
        )
        print('\n## Statistics\n')
        print('| Metric | Value |')
        print('|---------|--------|')
        print(f'| Highest Task Score | {highest_score * 100:.2f}% |')
        print(f'| Lowest Task Score | {lowest_score * 100:.2f}% |')
        print(f'| Median Task Score | {median_score * 100:.2f}% |')
        print(f'| Average Task Score | {avg_score * 100:.2f}% |')

    # Compute perfect completions and average score per nature category
    print('\n## Statistics per Nature Category\n')
    print('| Metric | Value |')
    print('|---------|--------|')
    for task_nature in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance', 'other']:
        num_of_tasks = sum(
            1
            for _, _, _, _, _, nature_category in detailed_results
            if nature_category == task_nature
        )
        if num_of_tasks == 0:
            # Skip categories with no tasks to avoid dividing by zero.
            continue
        task_nature_score = (
            sum(
                score
                for _, _, _, score, _, nature_category in detailed_results
                if nature_category == task_nature
            )
            / num_of_tasks
        )
        perfect_completions = sum(
            1
            for _, _, _, _, is_perfect, nature_category in detailed_results
            if nature_category == task_nature and is_perfect
        )
        print(
            f'| Perfect Completions for {task_nature} | {perfect_completions}/{num_of_tasks} ({perfect_completions / num_of_tasks * 100:.2f}%) |'
        )
        print(
            f'| Average Score for {task_nature} | {task_nature_score * 100:.2f}% |'
        )


if __name__ == '__main__':
    main()
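
# Example invocation (folder name is illustrative):
#   poetry run python summarise_results.py outputs/claude-3-5-sonnet
# The script prints a Markdown report (per-task table, summary, and statistics) to stdout.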