Spaces:
Build error
Build error
File size: 4,979 Bytes
3382f47 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
#!/usr/bin/env python3
from pathlib import Path
import click
from agbenchmark.reports.processing.report_types import Report
@click.command()
@click.argument(
    "report_json_file", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
def print_markdown_report(report_json_file: Path) -> None:
    """
    Print a Markdown report generated from a given report.json file.

    The report is written to stdout via `click.echo`; nothing is returned.

    :param report_json_file: Path to the report.json file.
    """
    # JSON is UTF-8 by specification, so don't depend on the locale's encoding.
    report = Report.model_validate_json(report_json_file.read_text(encoding="utf-8"))

    # Header and metadata
    click.echo("# Benchmark Report")
    click.echo(f"- ⌛ **Run time:** `{report.metrics.run_time}`")
    # Sub-bullets are indented to the parent item's content column so that
    # Markdown nests them under "Run time" instead of rendering siblings.
    click.echo(
        f"  - **Started at:** `{report.benchmark_start_time[:16].replace('T', '` `')}`"
    )
    if report.completion_time:
        click.echo(
            f"  - **Completed at:** `{report.completion_time[:16].replace('T', '` `')}`"
        )
    if report.metrics.total_cost:
        click.echo(f"- 💸 **Total cost:** `${round(report.metrics.total_cost, 2)}`")
    click.echo(
        f"- 🏅 **Highest achieved difficulty:** `{report.metrics.highest_difficulty}`"
    )
    click.echo(f"- ⚙️ **Command:** `{report.command}`")

    click.echo()  # spacing

    # Aggregate information
    successful, failed, unreliable = [], [], []
    for test in report.tests.values():
        # Fill in a missing success percentage from the individual results.
        # The value is stored back on the test because the per-challenge
        # section below reads it again.
        if test.metrics.success_percentage is None:
            n_results = len(test.results)
            # A test without any results counts as 0% instead of raising
            # ZeroDivisionError.
            test.metrics.success_percentage = (
                sum(float(r.success or 0) for r in test.results) * 100 / n_results
                if n_results
                else 0.0
            )

        if test.metrics.success_percentage == 100.0:
            successful.append(test)
        elif test.metrics.success_percentage == 0.0:
            failed.append(test)
        else:
            unreliable.append(test)

    # Summary
    click.echo("## Summary")
    click.echo(f"- **`{len(successful)}` passed** {'✅'*len(successful)}")
    click.echo(f"- **`{len(failed)}` failed** {'❌'*len(failed)}")
    click.echo(f"- **`{len(unreliable)}` unreliable** {'⚠️'*len(unreliable)}")

    click.echo()  # spacing

    # Test results
    click.echo("## Challenges")
    for test_name, test in report.tests.items():
        click.echo()  # spacing

        if test.metrics.success_percentage == 100.0:
            result_indicator = "✅"
        elif test.metrics.success_percentage > 0:
            result_indicator = "⚠️"
        else:
            result_indicator = "❌"
        click.echo(
            f"### {test_name} {result_indicator if test.metrics.attempted else '❔'}"
        )
        click.echo(f"{test.description}")

        click.echo()  # spacing

        click.echo(f"- **Attempted:** {'Yes 👍' if test.metrics.attempted else 'No 👎'}")
        n_passed = sum(1 for r in test.results if r.success)
        click.echo(
            f"- **Success rate:** {round(test.metrics.success_percentage)}% "
            f"({n_passed}/{len(test.results)})"
        )
        click.echo(f"- **Difficulty:** `{test.difficulty}`")
        click.echo(f"- **Categories:** `{'`, `'.join(test.category)}`")
        click.echo(
            f"<details>\n<summary><strong>Task</strong> (click to expand)</summary>\n\n"
            f"{indent('> ', test.task)}\n\n"
            f"Reference answer:\n{indent('> ', test.answer)}\n"
            "</details>"
        )

        click.echo()  # spacing

        click.echo("\n#### Attempts")
        for i, attempt in enumerate(test.results, 1):
            click.echo(
                f"\n{i}. **{'✅ Passed' if attempt.success else '❌ Failed'}** "
                f"in **{attempt.run_time}** "
                f"and **{quantify('step', attempt.n_steps)}**\n"
            )
            # Everything belonging to an attempt is indented by 3 columns (the
            # width of the "N. " marker) so it stays inside the numbered item.
            if attempt.cost is not None:
                click.echo(f"   - **Cost:** `${round(attempt.cost, 3)}`")
            if attempt.fail_reason:
                click.echo(
                    "   - **Failure reason:**\n"
                    + indent("     > ", attempt.fail_reason)
                    + "\n"
                )
            if attempt.steps:
                click.echo(
                    indent(
                        3 * " ",
                        "<details>\n<summary><strong>Steps</strong></summary>\n",
                    )
                )
                for j, step in enumerate(attempt.steps, 1):
                    click.echo()
                    # The inner indent aligns continuation lines of the step
                    # output with the text after "j. "; the outer one nests
                    # the whole step under the attempt.
                    click.echo(
                        indent(3 * " ", f"{j}. {indent(3*' ', step.output, False)}")
                    )
                click.echo("\n</details>")
def indent(indent: str, text: str, prefix_indent: bool = True) -> str:
    """
    Insert `indent` after every newline in `text`.

    :param indent: String placed at the start of each line following a newline.
    :param text: The (possibly multi-line) text to indent.
    :param prefix_indent: Whether the first line is prefixed with `indent` too.
    :return: The indented text.
    """
    lines = text.split("\n")
    body = ("\n" + indent).join(lines)
    if prefix_indent:
        return indent + body
    return body
def quantify(noun: str, count: int, plural_suffix: str = "s") -> str:
    """
    Format a count together with its noun, e.g. "1 step" or "3 steps".

    :param noun: Singular form of the noun.
    :param count: The quantity to report.
    :param plural_suffix: Suffix appended to `noun` unless `count` equals 1.
    :return: The string "<count> <noun>" with the suffix applied as needed.
    """
    suffix = "" if count == 1 else plural_suffix
    return f"{count} {noun}{suffix}"
# Script entry point: invoke the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    print_markdown_report()
|