#!/usr/bin/env python3

from pathlib import Path

import click

from agbenchmark.reports.processing.report_types import Report


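# CLI entry point: Click supplies report_json_file from the command line,
# so the bare print_markdown_report() call under __main__ works.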
@click.command()
@click.argument(
    "report_json_file", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
def print_markdown_report(report_json_file: Path):
    """
    Generates a Markdown report from a given report.json file and prints it.

    :param report_json_file: Path to the report.json file.
    """
    report = Report.model_validate_json(report_json_file.read_text())

    # Header and metadata
    click.echo("# Benchmark Report")
    click.echo(f"- ⌛ **Run time:** `{report.metrics.run_time}`")
    click.echo(
        f"  - **Started at:** `{report.benchmark_start_time[:16].replace('T', '` `')}`"
    )
    if report.completion_time:
        click.echo(
            f"  - **Completed at:** `{report.completion_time[:16].replace('T', '` `')}`"
        )
    if report.metrics.total_cost:
        click.echo(f"- 💸 **Total cost:** `${round(report.metrics.total_cost, 2)}`")
    click.echo(
        f"- 🏅 **Highest achieved difficulty:** `{report.metrics.highest_difficulty}`"
    )
    click.echo(f"- ⚙️ **Command:** `{report.command}`")

    click.echo()  # spacing

    # Aggregate information
    successful, failed, unreliable = [], [], []
    for test in report.tests.values():
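        # Backfill success_percentage from the individual results when the
        # report does not already provide it, then bucket the test by its rate.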
        test.metrics.success_percentage = (
            rsp
            if (rsp := test.metrics.success_percentage) is not None
            else sum(float(r.success or 0) for r in test.results)
            * 100
            / len(test.results)
        )
        if test.metrics.success_percentage == 100.0:
            successful.append(test)
        elif test.metrics.success_percentage == 0.0:
            failed.append(test)
        else:
            unreliable.append(test)

    # Summary
    click.echo("## Summary")
    click.echo(f"- **`{len(successful)}` passed** {'✅'*len(successful)}")
    click.echo(f"- **`{len(failed)}` failed** {'❌'*len(failed)}")
    click.echo(f"- **`{len(unreliable)}` unreliable** {'⚠️'*len(unreliable)}")

    click.echo()  # spacing

    # Test results
    click.echo("## Challenges")
    for test_name, test in report.tests.items():
        click.echo()  # spacing

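        # Pick a status emoji: ✅ all attempts passed, ⚠️ mixed results, ❌ all failed.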
        result_indicator = (
            "✅"
            if test.metrics.success_percentage == 100.0
            else "⚠️"
            if test.metrics.success_percentage > 0
            else "❌"
        )
        click.echo(
            f"### {test_name} {result_indicator if test.metrics.attempted else '❔'}"
        )
        click.echo(f"{test.description}")

        click.echo()  # spacing

        click.echo(f"- **Attempted:** {'Yes 👍' if test.metrics.attempted else 'No 👎'}")
        click.echo(
            f"- **Success rate:** {round(test.metrics.success_percentage)}% "
            f"({len([r for r in test.results if r.success])}/{len(test.results)})"
        )
        click.echo(f"- **Difficulty:** `{test.difficulty}`")
        click.echo(f"- **Categories:** `{'`, `'.join(test.category)}`")
        click.echo(
            f"<details>\n<summary><strong>Task</strong> (click to expand)</summary>\n\n"
            f"{indent('> ', test.task)}\n\n"
            f"Reference answer:\n{indent('> ', test.answer)}\n"
            "</details>"
        )

        click.echo()  # spacing

        click.echo("\n#### Attempts")
        for i, attempt in enumerate(test.results, 1):
            click.echo(
                f"\n{i}. **{'✅ Passed' if attempt.success else '❌ Failed'}** "
                f"in **{attempt.run_time}** "
                f"and **{quantify('step', attempt.n_steps)}**\n"
            )
            if attempt.cost is not None:
                click.echo(f"   - **Cost:** `${round(attempt.cost, 3)}`")
            if attempt.fail_reason:
                click.echo(
                    "   - **Failure reason:**\n"
                    + indent("      > ", attempt.fail_reason)
                    + "\n"
                )
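            # Render the agent's steps inside a collapsible <details> block.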
            if attempt.steps:
                click.echo(
                    indent(
                        3 * " ",
                        "<details>\n<summary><strong>Steps</strong></summary>\n",
                    )
                )
                for j, step in enumerate(attempt.steps, 1):
                    click.echo()
                    click.echo(
                        indent(3 * " ", f"{j}. {indent(3*' ', step.output, False)}")
                    )
                click.echo("\n</details>")


def indent(indent: str, text: str, prefix_indent: bool = True) -> str:
    return (indent if prefix_indent else "") + text.replace("\n", "\n" + indent)


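# Returns the count together with a correctly pluralized noun,
# e.g. quantify("step", 1) -> "1 step", quantify("step", 3) -> "3 steps".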
def quantify(noun: str, count: int, plural_suffix: str = "s") -> str:
    if count == 1:
        return f"{count} {noun}"
    return f"{count} {noun}{plural_suffix}"


if __name__ == "__main__":
    print_markdown_report()