Spaces:

seawolf2357
/

autogpt2

Build error

App Files Files Community

autogpt2 / classic /benchmark /reports /match_records.py

seawolf2357

Deploy from GitHub repository

3382f47 verified about 1 month ago

raw

history blame contribute delete

12.3 kB

	import glob
	import json
	import os
	from typing import Dict, List, Optional, Union

	import pandas as pd
	from gql import Client, gql
	from gql.transport.aiohttp import AIOHTTPTransport
	from pydantic import BaseModel, Field

	# from agbenchmark.reports.processing.report_types import Report, SuiteTest


	class Metrics(BaseModel):
	    """Per-test metrics as stored under a test's ``metrics`` key in report.json."""

	    # Required fields.
	    difficulty: str
	    success: bool
	    # The JSON key is "success_%", which is not a valid Python identifier,
	    # so it is mapped onto ``success_percent`` through a pydantic alias.
	    success_percent: float = Field(alias="success_%")
	    # Optional fields; each defaults to None when absent from the JSON.
	    run_time: str | None = None
	    fail_reason: str | None = None
	    attempted: bool | None = None


	class MetricsOverall(BaseModel):
	    """Aggregate metrics attached to a whole report or to a test suite."""

	    run_time: str
	    highest_difficulty: str
	    # Optional; defaults to None when the JSON does not carry it.
	    percentage: float | None = None


	class Test(BaseModel):
	    """A single test entry of a benchmark report."""

	    # Required fields.
	    data_path: str
	    is_regression: bool
	    answer: str
	    description: str
	    metrics: Metrics
	    category: List[str]
	    # Optional fields; each defaults to None when absent from the JSON.
	    task: str | None = None
	    reached_cutoff: bool | None = None


	class SuiteTest(BaseModel):
	    """A suite entry of a benchmark report: several ``Test`` items keyed by name."""

	    # Required fields.
	    data_path: str
	    metrics: MetricsOverall
	    tests: Dict[str, Test]
	    # Optional fields; each defaults to None when absent from the JSON.
	    # get_reports() treats a truthy ``category`` as "same task" suite.
	    category: List[str] | None = None
	    task: str | None = None
	    reached_cutoff: bool | None = None


	class Report(BaseModel):
	    """Top-level schema of a ``report.json`` file produced by a benchmark run."""

	    command: str
	    completion_time: str
	    benchmark_start_time: str
	    metrics: MetricsOverall
	    # Each entry is either a stand-alone test or a suite of tests.
	    tests: Dict[str, Union[Test, SuiteTest]]
	    # Config values are plain strings or one level of string-to-string mapping.
	    # (A stray backslash before "|" used to make this line a syntax error.)
	    config: Dict[str, str | dict[str, str]]


	def _single_test_row(base, name, test):
	    """Build the flat record for one ``Test`` entry called ``name``."""
	    return {
	        **base,
	        "challenge": name,
	        "attempted": test.metrics.attempted,
	        "categories": ", ".join(test.category),
	        "task": test.task,
	        "success": 100.0 if test.metrics.success else 0,
	        "difficulty": test.metrics.difficulty,
	        # The model attribute is ``success_percent`` (JSON alias "success_%").
	        "success_%": test.metrics.success_percent,
	        "run_time": test.metrics.run_time,
	        "is_regression": test.is_regression,
	    }


	def _same_task_suite_row(base, name, suite):
	    """Build the single record for a suite whose tests share one task."""
	    # "attempted" and "is_regression" are taken from the suite's first test;
	    # both stay None when the suite has no tests at all.
	    first_test = next(iter(suite.tests.values()), None)
	    return {
	        **base,
	        "challenge": name,
	        "attempted": first_test.metrics.attempted if first_test else None,
	        "categories": ", ".join(suite.category),
	        "task": suite.task,
	        "success": suite.metrics.percentage,
	        "difficulty": suite.metrics.highest_difficulty,
	        "success_%": suite.metrics.percentage,
	        "run_time": suite.metrics.run_time,
	        "is_regression": first_test.is_regression if first_test else None,
	    }


	def _rows_for_entry(base, name, entry):
	    """Return every flat record produced by one entry of ``Report.tests``."""
	    if not isinstance(entry, SuiteTest):
	        return [_single_test_row(base, name, entry)]
	    if entry.category:  # this means it's a same task test
	        return [_same_task_suite_row(base, name, entry)]
	    # Separate tasks in one suite: one record per contained test, so that
	    # no sub-test is lost.
	    return [
	        _single_test_row(base, suite_test_name, suite_data)
	        for suite_test_name, suite_data in entry.tests.items()
	    ]


	def get_reports():
	    """Collect every ``<agent>/<run>/report.json`` into one DataFrame.

	    The reports directory is the current working directory when its path
	    ends with ``reports``, otherwise the ``reports`` sub-directory of it.
	    Each report is validated with ``Report`` and flattened to one row per
	    test, with the columns ``agent``, ``benchmark_start_time``,
	    ``challenge``, ``attempted``, ``categories``, ``task``, ``success``,
	    ``difficulty``, ``success_%``, ``run_time`` and ``is_regression``.

	    Returns:
	        A ``pandas.DataFrame`` of the rows; it is empty (and has no columns)
	        when no report file is found.
	    """
	    report_data = []

	    current_dir = os.getcwd()
	    # When already inside the reports directory, scan it directly rather
	    # than the filesystem root.
	    reports_dir = current_dir if current_dir.endswith("reports") else "reports"

	    for agent_name in os.listdir(reports_dir):
	        agent_dir = os.path.join(reports_dir, agent_name)
	        if not os.path.isdir(agent_dir):
	            continue

	        # The glob also matches plain files; the isfile() check below only
	        # succeeds for run directories that actually hold a report.json.
	        for run_dir in glob.glob(os.path.join(agent_dir, "*")):
	            report_file = os.path.join(run_dir, "report.json")
	            if not os.path.isfile(report_file):
	                continue

	            with open(report_file, "r", encoding="utf-8") as f:
	                json_data = json.load(f)
	            print(f"Processing {report_file}")
	            report = Report.model_validate(json_data)

	            base = {
	                "agent": agent_name.lower(),
	                "benchmark_start_time": report.benchmark_start_time,
	            }
	            for test_name, test_data in report.tests.items():
	                report_data.extend(_rows_for_entry(base, test_name, test_data))

	    return pd.DataFrame(report_data)


	def get_helicone_data():
	    """Page through the Helicone GraphQL API and return the requests as a DataFrame.

	    The API key is read from the ``HELICONE_API_KEY`` environment variable.
	    Records are fetched ``SIZE`` at a time until a page comes back empty or
	    a request fails; a failure is printed and ends the paging, keeping the
	    rows fetched so far.

	    Returns:
	        A ``pandas.DataFrame`` with the columns ``createdAt``, ``agent``,
	        ``costUSD``, ``job_id``, ``challenge``, ``benchmark_start_time``,
	        ``prompt``, ``response``, ``model`` and ``request``. Rows without an
	        ``agent`` property are dropped and agent names are lower-cased. The
	        columns are present even when nothing was fetched.
	    """
	    helicone_api_key = os.getenv("HELICONE_API_KEY")

	    url = "https://www.helicone.ai/api/graphql"
	    transport = AIOHTTPTransport(
	        url=url, headers={"authorization": f"Bearer {helicone_api_key}"}
	    )

	    client = Client(transport=transport, fetch_schema_from_transport=True)

	    SIZE = 250
	    columns = [
	        "createdAt",
	        "agent",
	        "costUSD",
	        "job_id",
	        "challenge",
	        "benchmark_start_time",
	        "prompt",
	        "response",
	        "model",
	        "request",
	    ]

	    # The query text never changes between pages, so parse it only once.
	    query = gql(
	        """
	        query ExampleQuery($limit: Int, $offset: Int){
	            heliconeRequest(
	                limit: $limit
	                offset: $offset
	            ) {
	                costUSD
	                prompt
	                properties{
	                    name
	                    value
	                }

	                requestBody
	                response
	                createdAt

	            }

	        }
	        """
	    )

	    i = 0
	    data = []
	    print("Fetching data from Helicone")
	    while True:
	        print(f"Fetching {i * SIZE} to {(i + 1) * SIZE} records")
	        try:
	            result = client.execute(
	                query, variable_values={"limit": SIZE, "offset": i * SIZE}
	            )
	        except Exception as e:
	            # Best effort: report the failure and stop paging.
	            print(f"Error occurred: {e}")
	            result = None

	        i += 1

	        page = result["heliconeRequest"] if result else []
	        if not page:
	            print("No more results")
	            break

	        for item in page:
	            # Guard against null "properties" / "requestBody" values.
	            properties = {
	                prop["name"]: prop["value"] for prop in item["properties"] or []
	            }
	            request_body = item["requestBody"] or {}
	            data.append(
	                {
	                    "createdAt": item["createdAt"],
	                    "agent": properties.get("agent"),
	                    "costUSD": item["costUSD"],
	                    "job_id": properties.get("job_id"),
	                    "challenge": properties.get("challenge"),
	                    "benchmark_start_time": properties.get("benchmark_start_time"),
	                    "prompt": item["prompt"],
	                    "response": item["response"],
	                    "model": request_body.get("model"),
	                    "request": request_body.get("messages"),
	                }
	            )

	    # Explicit columns keep the "agent" column present when no rows were
	    # fetched, so the dropna() below cannot fail on a missing column.
	    df = pd.DataFrame(data, columns=columns)
	    # Drop rows where agent is None
	    df = df.dropna(subset=["agent"])

	    # Convert the remaining agent names to lowercase
	    df["agent"] = df["agent"].str.lower()

	    return df


	# Reuse the pickled raw frames only when both cache files are present;
	# otherwise rebuild both and refresh the caches.
	# NOTE(review): read_pickle executes arbitrary pickle payloads; only keep
	# these cache files in a trusted location.
	if not (os.path.exists("raw_reports.pkl") and os.path.exists("raw_helicone.pkl")):
	    reports_df = get_reports()
	    reports_df.to_pickle("raw_reports.pkl")
	    helicone_df = get_helicone_data()
	    helicone_df.to_pickle("raw_helicone.pkl")
	else:
	    reports_df = pd.read_pickle("raw_reports.pkl")
	    helicone_df = pd.read_pickle("raw_helicone.pkl")


	def try_formats(date_str):
	    """Parse ``date_str`` with the first matching known timestamp layout.

	    Two layouts are tried in order: ``%Y-%m-%d-%H:%M`` and
	    ``%Y-%m-%dT%H:%M:%S%z``.

	    Returns:
	        Whatever ``pd.to_datetime`` yields for the first layout that does
	        not raise ``ValueError``, or ``None`` when both layouts fail.
	    """
	    for pattern in ("%Y-%m-%d-%H:%M", "%Y-%m-%dT%H:%M:%S%z"):
	        try:
	            parsed = pd.to_datetime(date_str, format=pattern)
	        except ValueError:
	            # This layout does not fit; move on to the next one.
	            continue
	        return parsed
	    return None


	# Normalise both frames to timezone-aware UTC start times so that they can
	# be joined on equality; rows whose start time could not be parsed by
	# try_formats() are dropped.
	helicone_df["benchmark_start_time"] = pd.to_datetime(
	    helicone_df["benchmark_start_time"].apply(try_formats), utc=True
	)
	helicone_df = helicone_df.dropna(subset=["benchmark_start_time"])
	# createdAt is interpreted as milliseconds since the Unix epoch.
	helicone_df["createdAt"] = pd.to_datetime(
	    helicone_df["createdAt"], unit="ms", origin="unix"
	)
	reports_df["benchmark_start_time"] = pd.to_datetime(
	    reports_df["benchmark_start_time"].apply(try_formats), utc=True
	)
	reports_df = reports_df.dropna(subset=["benchmark_start_time"])

	# Sanity checks: both join keys must be datetime columns by now.
	assert pd.api.types.is_datetime64_any_dtype(
	    helicone_df["benchmark_start_time"]
	), "benchmark_start_time in helicone_df is not datetime"
	assert pd.api.types.is_datetime64_any_dtype(
	    reports_df["benchmark_start_time"]
	), "benchmark_start_time in reports_df is not datetime"

	# Extra copy of the report's start time under its own column name.
	reports_df["report_time"] = reports_df["benchmark_start_time"]

	# Disabled alternative: nearest-earlier match instead of an exact join.
	# df = pd.merge_asof(
	#     helicone_df.sort_values("benchmark_start_time"),
	#     reports_df.sort_values("benchmark_start_time"),
	#     left_on="benchmark_start_time",
	#     right_on="benchmark_start_time",
	#     by=["agent", "challenge"],
	#     direction="backward",
	# )

	# Keep only Helicone requests that match a report row exactly on start
	# time, agent and challenge.
	df = pd.merge(
	    helicone_df,
	    reports_df,
	    on=["benchmark_start_time", "agent", "challenge"],
	    how="inner",
	)

	df.to_pickle("df.pkl")
	# DataFrame.info() writes its summary to stdout itself and returns None,
	# so wrapping it in print() would add a stray "None" line.
	df.info()
	print("Data saved to df.pkl")
	print("To load the data use: df = pd.read_pickle('df.pkl')")