import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import AsyncIterator, Awaitable, ClassVar, Optional

import pytest
from agent_protocol_client import AgentApi, Step
from colorama import Fore, Style
from pydantic import BaseModel, Field

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult

logger = logging.getLogger(__name__)


class ChallengeInfo(BaseModel):
    eval_id: str = ""
    name: str
    task: str
    task_artifacts_dir: Optional[Path] = None
    category: list[Category]
    difficulty: Optional[DifficultyLevel] = None
    description: Optional[str] = None
    dependencies: list[str] = Field(default_factory=list)
    reference_answer: Optional[str]

    source_uri: str
    """Internal reference indicating the source of the challenge specification"""

    available: bool = True
    unavailable_reason: str = ""
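

# Illustrative sketch: constructing a ChallengeInfo directly. All values below
# are made-up examples, not those of any real challenge in the benchmark.
#
#   info = ChallengeInfo(
#       name="ExampleChallenge",
#       task="Write the word 'hello' to a file named output.txt.",
#       category=[Category.DATA],  # assumes this enum member exists
#       reference_answer="hello",
#       source_uri="__example__/example_challenge",
#   )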


class BaseChallenge(ABC):
    """
    The base class and shared interface for all specific challenge implementations.
    """

    info: ClassVar[ChallengeInfo]
    @classmethod
    @abstractmethod
    def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
        """
        Construct an individual challenge subclass from a suitable `source_uri` (as in
        `ChallengeInfo.source_uri`).
        """
        ...

    @abstractmethod
    def test_method(
        self,
        config: AgentBenchmarkConfig,
        request: pytest.FixtureRequest,
        i_attempt: int,
    ) -> None | Awaitable[None]:
        """
        Test method for use by Pytest-based benchmark sessions. Should return normally
        if the challenge passes, and raise a (preferably descriptive) error otherwise.
        """
        ...
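
    # Illustrative example of a "descriptive error" a failing implementation
    # might raise (`output` is a hypothetical local, not part of this API):
    #
    #   raise AssertionError(
    #       f"Agent output {output!r} does not match reference answer "
    #       f"{self.info.reference_answer!r}"
    #   )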

    @classmethod
    async def run_challenge(
        cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
    ) -> AsyncIterator[Step]:
        """
        Runs the challenge on the subject agent with the specified timeout.
        Also prints basic challenge and status info to STDOUT.

        Params:
            config: The subject agent's benchmark config.
            timeout: Timeout (seconds) after which to stop the run if not finished.
            mock: Whether to mock the agent's behavior instead of running it for real.

        Yields:
            Step: The steps generated by the agent for the challenge task.
        """
        # avoid circular import
        from agbenchmark.agent_api_interface import run_api_agent

        print()
        print(
            f"{Fore.MAGENTA + Style.BRIGHT}{'='*24} "
            f"Starting {cls.info.name} challenge"
            f" {'='*24}{Style.RESET_ALL}"
        )
        print(f"{Fore.CYAN}Timeout:{Fore.RESET} {timeout} seconds")
        print(f"{Fore.CYAN}Task:{Fore.RESET} {cls.info.task}")
        print()

        logger.debug(f"Starting {cls.info.name} challenge run")
        i = 0
        async for step in run_api_agent(
            cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
        ):
            i += 1
            print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
            yield step
        logger.debug(f"Finished {cls.info.name} challenge run")

    @classmethod
    @abstractmethod
    async def evaluate_task_state(
        cls, agent: AgentApi, task_id: str
    ) -> list[EvalResult]:
        """
        Evaluate the agent's work on the task with the given `task_id` and
        return the results of all applicable evaluations.
        """
        ...
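

# Illustrative sketch (not one of the benchmark's built-in challenges),
# realizing the commented ChallengeInfo example above: a minimal concrete
# subclass showing how the abstract interface is meant to be filled in.
# The class name, ChallengeInfo values, and trivial eval logic are all
# assumptions for demonstration purposes only.
class _ExampleChallenge(BaseChallenge):
    info = ChallengeInfo(
        name="ExampleChallenge",
        task="Write the word 'hello' to a file named output.txt.",
        category=[],  # real challenges would list applicable Category members
        reference_answer="hello",
        source_uri="__example__/example_challenge",
    )

    @classmethod
    def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
        if source_uri != cls.info.source_uri:
            raise ValueError(f"Unknown source_uri: {source_uri}")
        return cls

    def test_method(
        self,
        config: AgentBenchmarkConfig,
        request: pytest.FixtureRequest,
        i_attempt: int,
    ) -> None:
        # A real challenge would drive `run_challenge` here and assert on the
        # results of `evaluate_task_state`.
        raise NotImplementedError

    @classmethod
    async def evaluate_task_state(
        cls, agent: AgentApi, task_id: str
    ) -> list[EvalResult]:
        # A real implementation would inspect the task's output via `agent`
        # and compare it against `cls.info.reference_answer`.
        return []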