Commit 
							
							·
						
						f3dc796
	
1
								Parent(s):
							
							29e37fd
								
update
Browse files- app.py +4 -6
 - src/assets/text_content.py +8 -6
 
    	
        app.py
    CHANGED
    
    | 
         @@ -24,12 +24,14 @@ ALL_COLUMNS_MAPPING = { 
     | 
|
| 24 | 
         
             
                # model
         
     | 
| 25 | 
         
             
                "Model": "Model 🤗",
         
     | 
| 26 | 
         
             
                "Arch": "Arch ποΈ",
         
     | 
| 27 | 
         
            -
                "Size": "Size  
     | 
| 28 | 
         
             
                # deployment settings
         
     | 
| 29 | 
         
             
                "backend.name": "Backend π",
         
     | 
| 30 | 
         
             
                "backend.torch_dtype": "Dtype 📥",
         
     | 
| 31 | 
         
             
                "optimizations": "Optimizations 🛠️",
         
     | 
| 32 | 
         
             
                "quantization": "Quantization ποΈ",
         
     | 
| 
         | 
|
| 
         | 
|
| 33 | 
         
             
                # throughput measurements
         
     | 
| 34 | 
         
             
                "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) β¬οΈ",
         
     | 
| 35 | 
         
             
                "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) β¬οΈ",
         
     | 
| 
         @@ -42,8 +44,6 @@ ALL_COLUMNS_MAPPING = { 
     | 
|
| 42 | 
         
             
                "generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
         
     | 
| 43 | 
         
             
                # energy measurements
         
     | 
| 44 | 
         
             
                "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
         
     | 
| 45 | 
         
            -
                # quality measurements
         
     | 
| 46 | 
         
            -
                "Score": "Avg Score (%) β¬οΈ",
         
     | 
| 47 | 
         
             
            }
         
     | 
| 48 | 
         
             
            SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
         
     | 
| 49 | 
         
             
            SORTING_ASCENDING = [False, True]
         
     | 
| 
         @@ -148,9 +148,7 @@ def get_benchmark_chart(bench_df): 
     | 
|
| 148 | 
         
             
                copy_df = bench_df.copy()
         
     | 
| 149 | 
         
             
                # transform
         
     | 
| 150 | 
         
             
                copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
         
     | 
| 151 | 
         
            -
                #  
     | 
| 152 | 
         
            -
                # copy_df = copy_df[copy_df["E2E Latency (s) β¬οΈ"] <= 100]
         
     | 
| 153 | 
         
            -
             
     | 
| 154 | 
         
             
                fig = px.scatter(
         
     | 
| 155 | 
         
             
                    copy_df,
         
     | 
| 156 | 
         
             
                    y="Avg Score (%) β¬οΈ",
         
     | 
| 
         | 
|
| 24 | 
         
             
                # model
         
     | 
| 25 | 
         
             
                "Model": "Model 🤗",
         
     | 
| 26 | 
         
             
                "Arch": "Arch ποΈ",
         
     | 
| 27 | 
         
            +
                "Size": "Size π",
         
     | 
| 28 | 
         
             
                # deployment settings
         
     | 
| 29 | 
         
             
                "backend.name": "Backend π",
         
     | 
| 30 | 
         
             
                "backend.torch_dtype": "Dtype 📥",
         
     | 
| 31 | 
         
             
                "optimizations": "Optimizations 🛠️",
         
     | 
| 32 | 
         
             
                "quantization": "Quantization ποΈ",
         
     | 
| 33 | 
         
            +
                # quality measurements
         
     | 
| 34 | 
         
            +
                "Score": "Avg Score (%) β¬οΈ",
         
     | 
| 35 | 
         
             
                # throughput measurements
         
     | 
| 36 | 
         
             
                "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) β¬οΈ",
         
     | 
| 37 | 
         
             
                "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) β¬οΈ",
         
     | 
| 
         | 
|
| 44 | 
         
             
                "generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
         
     | 
| 45 | 
         
             
                # energy measurements
         
     | 
| 46 | 
         
             
                "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
         
     | 
| 
         | 
|
| 
         | 
|
| 47 | 
         
             
            }
         
     | 
| 48 | 
         
             
            SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
         
     | 
| 49 | 
         
             
            SORTING_ASCENDING = [False, True]
         
     | 
| 
         | 
|
| 148 | 
         
             
                copy_df = bench_df.copy()
         
     | 
| 149 | 
         
             
                # transform
         
     | 
| 150 | 
         
             
                copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
         
     | 
| 151 | 
         
            +
                # plot
         
     | 
| 
         | 
|
| 
         | 
|
| 152 | 
         
             
                fig = px.scatter(
         
     | 
| 153 | 
         
             
                    copy_df,
         
     | 
| 154 | 
         
             
                    y="Avg Score (%) β¬οΈ",
         
     | 
    	
        src/assets/text_content.py
    CHANGED
    
    | 
         @@ -12,7 +12,7 @@ ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3> 
     | 
|
| 12 | 
         
             
            <ul>
         
     | 
| 13 | 
         
             
                <li>To avoid communication-dependent results, only one GPU is used.</li>
         
     | 
| 14 | 
         
             
                <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
         
     | 
| 15 | 
         
            -
                <li>LLMs are running on a singleton batch with a prompt size of  
     | 
| 16 | 
         
             
                <li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
         
     | 
| 17 | 
         
             
                <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
         
     | 
| 18 | 
         
             
                <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
         
     | 
| 
         @@ -44,19 +44,21 @@ device: cuda 
     | 
|
| 44 | 
         | 
| 45 | 
         
             
            backend:
         
     | 
| 46 | 
         
             
              no_weights: true
         
     | 
| 47 | 
         
            -
              delete_cache: true
         
     | 
| 48 | 
         
             
              torch_dtype: float16
         
     | 
| 49 | 
         
            -
              quantization_strategy: gptq
         
     | 
| 50 | 
         
             
              bettertransformer: true
         
     | 
| 
         | 
|
| 
         | 
|
| 51 | 
         | 
| 52 | 
         
             
            benchmark:
         
     | 
| 53 | 
         
             
              memory: true
         
     | 
| 54 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 55 | 
         
             
              input_shapes:
         
     | 
| 56 | 
         
             
                batch_size: 1
         
     | 
| 57 | 
         
            -
                sequence_length:  
     | 
| 
         | 
|
| 58 | 
         | 
| 59 | 
         
            -
              new_tokens: 1000
         
     | 
| 60 | 
         
             
            ```
         
     | 
| 61 | 
         
             
            """
         
     | 
| 62 | 
         | 
| 
         | 
|
| 12 | 
         
             
            <ul>
         
     | 
| 13 | 
         
             
                <li>To avoid communication-dependent results, only one GPU is used.</li>
         
     | 
| 14 | 
         
             
                <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
         
     | 
| 15 | 
         
            +
                <li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 1000 tokens.</li>
         
     | 
| 16 | 
         
             
                <li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
         
     | 
| 17 | 
         
             
                <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
         
     | 
| 18 | 
         
             
                <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
         
     | 
| 
         | 
|
| 44 | 
         | 
| 45 | 
         
             
            backend:
         
     | 
| 46 | 
         
             
              no_weights: true
         
     | 
| 
         | 
|
| 47 | 
         
             
              torch_dtype: float16
         
     | 
| 
         | 
|
| 48 | 
         
             
              bettertransformer: true
         
     | 
| 49 | 
         
            +
              quantization_scheme: gptq
         
     | 
| 50 | 
         
            +
             
     | 
| 51 | 
         | 
| 52 | 
         
             
            benchmark:
         
     | 
| 53 | 
         
             
              memory: true
         
     | 
| 54 | 
         
            +
              energy: true
         
     | 
| 55 | 
         
            +
              
         
     | 
| 56 | 
         
            +
              new_tokens: 1000
         
     | 
| 57 | 
         
             
              input_shapes:
         
     | 
| 58 | 
         
             
                batch_size: 1
         
     | 
| 59 | 
         
            +
                sequence_length: 256
         
     | 
| 60 | 
         
            +
             
     | 
| 61 | 
         | 
| 
         | 
|
| 62 | 
         
             
            ```
         
     | 
| 63 | 
         
             
            """
         
     | 
| 64 | 
         |