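"""End-to-end test of the Agent Protocol workflow: create a task via the
benchmark proxy, verify it reaches the agent, execute a step, and request
an evaluation. Assumes a benchmark server on port 8080 and an agent on
port 8000 (see the URL constants below)."""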
import datetime
import time

import pytest
import requests

URL_BENCHMARK = "http://localhost:8080/ap/v1"
URL_AGENT = "http://localhost:8000/ap/v1"
# Skip the whole module if no agent is reachable at the expected URL.
try:
    requests.get(f"{URL_AGENT}/agent/tasks")
except requests.exceptions.ConnectionError:
    pytest.skip("No agent available to test against", allow_module_level=True)
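# The test below expects its arguments from pytest parametrization; the
# original decorator is not preserved here. The values in this sketch are
# hypothetical placeholders -- substitute eval IDs and inputs that exist in
# your benchmark's challenge set.
@pytest.mark.parametrize(
    "eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
    [
        # (eval_id, input_text, expected_artifact_length, test_name, should_be_successful)
        ("placeholder-eval-id", "placeholder task input", 0, "PlaceholderTest", True),
    ],
)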
def test_entire_workflow(
    eval_id: str,
    input_text: str,
    expected_artifact_length: int,
    test_name: str,
    should_be_successful: bool,
):
    task_request = {"eval_id": eval_id, "input": input_text}

    # Record how many tasks the agent knows about before creating a new one.
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_before = response.json()["pagination"]["total_items"]

    # Create the task through the benchmark proxy, which forwards it to the agent.
    task_response_benchmark = requests.post(
        f"{URL_BENCHMARK}/agent/tasks", json=task_request
    )

    # The agent should now report exactly one additional task.
    response = requests.get(f"{URL_AGENT}/agent/tasks")
    task_count_after = response.json()["pagination"]["total_items"]
    assert task_count_after == task_count_before + 1
    timestamp_after_task_creation = datetime.datetime.now(datetime.timezone.utc)
    time.sleep(1.1)  # Ensure the two timestamps compared later differ.

    assert task_response_benchmark.status_code == 200
    task_response_benchmark = task_response_benchmark.json()
    assert task_response_benchmark["input"] == input_text
    task_response_benchmark_id = task_response_benchmark["task_id"]
    # The task created via the benchmark must be retrievable from the agent,
    # with the expected number of attached artifacts.
    response_task_agent = requests.get(
        f"{URL_AGENT}/agent/tasks/{task_response_benchmark_id}"
    )
    assert response_task_agent.status_code == 200
    response_task_agent = response_task_agent.json()
    assert len(response_task_agent["artifacts"]) == expected_artifact_length
    # Execute a single step for the task.
    step_request = {"input": input_text}
    step_response = requests.post(
        f"{URL_BENCHMARK}/agent/tasks/{task_response_benchmark_id}/steps",
        json=step_request,
    )
    assert step_response.status_code == 200
    step_response = step_response.json()
    assert step_response["is_last"] is True  # Assumes the agent finishes in one step.
    # Ask the benchmark to evaluate the task's output.
    eval_response = requests.post(
        f"{URL_BENCHMARK}/agent/tasks/{task_response_benchmark_id}/evaluations",
        json={},
    )
    assert eval_response.status_code == 200
    eval_response = eval_response.json()
    print("eval_response:", eval_response)
    assert eval_response["run_details"]["test_name"] == test_name
    assert eval_response["metrics"]["success"] == should_be_successful
    # The recorded benchmark start time must predate the task-creation timestamp.
    benchmark_start_time = datetime.datetime.fromisoformat(
        eval_response["run_details"]["benchmark_start_time"]
    )
    assert benchmark_start_time < timestamp_after_task_creation
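# A minimal invocation sketch (the file name is hypothetical); -s surfaces the
# print output of the evaluation response:
#   pytest test_agent_protocol_workflow.py -s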