#!/usr/bin/env python3

from pathlib import Path

import click

from agbenchmark.reports.processing.report_types import Report


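# CLI entry point: Click supplies report_json_file from the command line,
# so the bare print_markdown_report() call under __main__ works.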
@click.command()
@click.argument(
    "report_json_file", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
def print_markdown_report(report_json_file: Path):
    """
    Generates a Markdown report from a given report.json file and prints it.

    :param report_json_file: Path to the report.json file.
    """
    report = Report.model_validate_json(report_json_file.read_text())

    # Header and metadata
    click.echo("# Benchmark Report")
    click.echo(f"- ⌛ **Run time:** `{report.metrics.run_time}`")
    click.echo(
        f"  - **Started at:** `{report.benchmark_start_time[:16].replace('T', '` `')}`"
    )
    if report.completion_time:
        click.echo(
            f"  - **Completed at:** `{report.completion_time[:16].replace('T', '` `')}`"
        )
    if report.metrics.total_cost:
        click.echo(f"- 💸 **Total cost:** `${round(report.metrics.total_cost, 2)}`")
    click.echo(
        f"- 🏅 **Highest achieved difficulty:** `{report.metrics.highest_difficulty}`"
    )
    click.echo(f"- ⚙️ **Command:** `{report.command}`")

    click.echo()  # spacing

    # Aggregate information
    successful, failed, unreliable = [], [], []
    for test in report.tests.values():
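        # Backfill success_percentage from the individual results when the
        # report does not already provide it, then bucket the test by its rate.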
        test.metrics.success_percentage = (
            rsp
            if (rsp := test.metrics.success_percentage) is not None
            else sum(float(r.success or 0) for r in test.results)
            * 100
            / len(test.results)
        )
        if test.metrics.success_percentage == 100.0:
            successful.append(test)
        elif test.metrics.success_percentage == 0.0:
            failed.append(test)
        else:
            unreliable.append(test)

    # Summary
    click.echo("## Summary")
    click.echo(f"- **`{len(successful)}` passed** {'✅'*len(successful)}")
    click.echo(f"- **`{len(failed)}` failed** {'❌'*len(failed)}")
    click.echo(f"- **`{len(unreliable)}` unreliable** {'⚠️'*len(unreliable)}")

    click.echo()  # spacing

    # Test results
    click.echo("## Challenges")
    for test_name, test in report.tests.items():
        click.echo()  # spacing

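        # Pick a status emoji: ✅ all attempts passed, ⚠️ mixed results, ❌ all failed.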
        result_indicator = (
            "✅"
            if test.metrics.success_percentage == 100.0
            else "⚠️"
            if test.metrics.success_percentage > 0
            else "❌"
        )
        click.echo(
            f"### {test_name} {result_indicator if test.metrics.attempted else '❔'}"
        )
        click.echo(f"{test.description}")

        click.echo()  # spacing

        click.echo(f"- **Attempted:** {'Yes 👍' if test.metrics.attempted else 'No 👎'}")
        click.echo(
            f"- **Success rate:** {round(test.metrics.success_percentage)}% "
            f"({len([r for r in test.results if r.success])}/{len(test.results)})"
        )
        click.echo(f"- **Difficulty:** `{test.difficulty}`")
        click.echo(f"- **Categories:** `{'`, `'.join(test.category)}`")
        click.echo(
            f"<details>\n<summary><strong>Task</strong> (click to expand)</summary>\n\n"
            f"{indent('> ', test.task)}\n\n"
            f"Reference answer:\n{indent('> ', test.answer)}\n"
            "</details>"
        )

        click.echo()  # spacing

        click.echo("\n#### Attempts")
        for i, attempt in enumerate(test.results, 1):
            click.echo(
                f"\n{i}. **{'✅ Passed' if attempt.success else '❌ Failed'}** "
                f"in **{attempt.run_time}** "
                f"and **{quantify('step', attempt.n_steps)}**\n"
            )
            if attempt.cost is not None:
                click.echo(f"   - **Cost:** `${round(attempt.cost, 3)}`")
            if attempt.fail_reason:
                click.echo(
                    "   - **Failure reason:**\n"
                    + indent("      > ", attempt.fail_reason)
                    + "\n"
                )
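            # Render the agent's steps inside a collapsible <details> block.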
            if attempt.steps:
                click.echo(
                    indent(
                        3 * " ",
                        "<details>\n<summary><strong>Steps</strong></summary>\n",
                    )
                )
                for j, step in enumerate(attempt.steps, 1):
                    click.echo()
                    click.echo(
                        indent(3 * " ", f"{j}. {indent(3*' ', step.output, False)}")
                    )
                click.echo("\n</details>")


def indent(indent: str, text: str, prefix_indent: bool = True) -> str:
    return (indent if prefix_indent else "") + text.replace("\n", "\n" + indent)


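# Returns the count together with a correctly pluralized noun,
# e.g. quantify("step", 1) -> "1 step", quantify("step", 3) -> "3 steps".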
def quantify(noun: str, count: int, plural_suffix: str = "s") -> str:
    if count == 1:
        return f"{count} {noun}"
    return f"{count} {noun}{plural_suffix}"


if __name__ == "__main__":
    print_markdown_report()