LOGO = '<img src="https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/logo.png">'
TITLE = """<h1 align="center" id="space-title">🤗 LLM-Perf Leaderboard 🏋️</h1>"""
INTRODUCTION = """
The 🤗 LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) on different hardware, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
- Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard); evaluated models are then added to the [🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
- Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) so that their relevance and feasibility can be assessed.
"""
ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
<ul>
<li>To avoid communication-dependent results, only one GPU is used.</li>
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
<li>LLMs run with a batch size of 1 and a prompt size of 256 tokens, generating 1000 new tokens.</li>
<li>Energy consumption is measured in kWh using CodeCarbon, taking into account the GPU, CPU, RAM and the location of the machine.</li>
<li>We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two are reported by PyTorch, while the last is observed using PyNVML (a sketch of these measurements follows the list).</li>
</ul>
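As a rough illustration, here is a minimal sketch of how such energy and memory measurements can be taken. It is not the leaderboard's exact benchmarking code (which uses Optimum-Benchmark); the model, prompt and device index are placeholder assumptions:

```python
# Minimal sketch, not the leaderboard's code: "gpt2" and the prompt are placeholders.
import torch
import pynvml
from codecarbon import EmissionsTracker
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("Hello", return_tensors="pt").to("cuda")

torch.cuda.reset_peak_memory_stats()  # reset PyTorch's peak memory counters
tracker = EmissionsTracker()  # CodeCarbon accounts for GPU, CPU, RAM and location
tracker.start()
model.generate(**inputs, max_new_tokens=1000)
tracker.stop()  # logs energy consumed (kWh) and emissions to emissions.csv

max_allocated = torch.cuda.max_memory_allocated()  # Max Allocated Memory (PyTorch)
max_reserved = torch.cuda.max_memory_reserved()    # Max Reserved Memory (PyTorch)

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# Max Used Memory is the peak of this driver-level reading, polled during
# generation; a single post-generation reading is shown here for brevity.
max_used = pynvml.nvmlDeviceGetMemoryInfo(handle).used
pynvml.nvmlShutdown()
```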
| """ | |
| EXAMPLE_CONFIG = """ | |
| Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark: | |
| ```yaml | |
| defaults: | |
| - backend: pytorch | |
| - _base_ # inheriting from base config | |
| - _self_ # for hydra 1.1 compatibility | |
| experiment_name: pytorch+cuda+float16+bettertransformer | |
| device: cuda | |
| backend: | |
| no_weights: true | |
| torch_dtype: float16 | |
| to_bettertransformer: true | |
| ``` | |
Where the base config is:
```yaml
defaults:
  - benchmark: inference # default benchmark
  - experiment # inheriting from experiment config
  - _self_ # for hydra 1.1 compatibility
  - override hydra/job_logging: colorlog # colorful logging
  - override hydra/hydra_logging: colorlog # colorful logging

hydra:
  run:
    dir: ???
  job:
    chdir: true
    env_set:
      CUDA_VISIBLE_DEVICES: 0
      CUDA_DEVICE_ORDER: PCI_BUS_ID

model: ???
experiment_name: ???

backend:
  initial_isolation_check: true
  continous_isolation_check: true

benchmark:
  duration: 10
  memory: true
  energy: true
  new_tokens: 1000
  input_shapes:
    batch_size: 1
    sequence_length: 256

hub_kwargs:
  trust_remote_code: true
```
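With both configuration files in place, a run can be launched through the Optimum-Benchmark CLI (a Hydra application), with command-line overrides filling the remaining `???` fields. A minimal sketch, where the config directory, output directory and model are placeholder assumptions:

```python
# Minimal sketch: invoke the optimum-benchmark CLI from Python.
# "configs", the run dir and "gpt2" are placeholders for illustration.
import subprocess

subprocess.run(
    [
        "optimum-benchmark",
        "--config-dir", "configs",
        "--config-name", "pytorch+cuda+float16+bettertransformer",
        "model=gpt2",  # fills the required `model: ???` field
        "hydra.run.dir=experiments/pytorch+cuda+float16+bettertransformer",  # fills `dir: ???`
    ],
    check=True,  # raise if the benchmark run fails
)
```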
| """ | |
| CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results." | |
CITATION_BUTTON = r"""@misc{llm-perf-leaderboard,
  author = {Ilyas Moutawwakil and Régis Pierrard},
  title = {LLM-Perf Leaderboard},
  year = {2023},
  publisher = {Hugging Face},
  howpublished = "\url{https://huggingface.co/spaces/optimum/llm-perf-leaderboard}",
}
@software{optimum-benchmark,
  author = {Ilyas Moutawwakil and Régis Pierrard},
  publisher = {Hugging Face},
  title = {Optimum-Benchmark: A framework for benchmarking the performance of Transformers models with different hardwares, backends and optimizations.},
  url = {https://github.com/huggingface/optimum-benchmark},
}
"""