Commit d350941 (parent: 35763fc)
Clémentine committed: fix rounding

Files changed:
- app.py (+3 -1)
- src/auto_leaderboard/load_results.py (+2 -4)
    	
app.py CHANGED

@@ -18,6 +18,8 @@ from src.assets.css_html_js import custom_css, get_window_url_params
 from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
 from src.init import get_all_requested_models, load_all_info_from_hub
 
+pd.set_option('display.precision', 1)
+
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
 

@@ -91,7 +93,7 @@ def get_leaderboard_df():
 
     df = pd.DataFrame.from_records(all_data)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[COLS]
+    df = df[COLS].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, BENCHMARK_COLS)]
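Note: the two added lines in app.py act at different layers: pd.set_option('display.precision', 1) only changes how pandas renders floats, while .round(decimals=2) rounds the values stored in the DataFrame itself. A minimal sketch of that difference, assuming nothing beyond pandas (the "score" column is made up for illustration):

import pandas as pd

pd.set_option('display.precision', 1)

# Hypothetical leaderboard column, for illustration only.
df = pd.DataFrame({"score": [33.3333, 66.6666]})

print(df)                       # rendered with 1 decimal place: 33.3 / 66.7
print(df["score"][0])           # stored value is untouched: 33.3333

rounded = df.round(decimals=2)  # returns a copy with the data itself rounded
print(rounded["score"][0])      # 33.33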
    	
src/auto_leaderboard/load_results.py CHANGED

@@ -44,9 +44,7 @@ class EvalResult:
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
-        data_dict[AutoEvalColumn.average.name] = round(
-            sum([v for k, v in self.results.items()]) / 4.0, 1
-        )
+        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
 
         for benchmark in BENCHMARKS:
             if benchmark not in self.results.keys():

@@ -95,7 +93,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
         accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
         if accs.size == 0:
             continue
-        mean_acc = round(np.mean(accs) * 100.0, 1)
+        mean_acc = np.mean(accs) * 100.0
         eval_results.append(EvalResult(
             eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, #todo model_type=, weight_type=
         ))
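Note: with the round(...) calls removed, both the per-benchmark mean_acc and the per-model average keep full float precision, and rounding now happens once when get_leaderboard_df() builds the table (plus the display.precision setting for rendering). A rough sketch of the before/after, using invented numbers and placeholder benchmark names:

import numpy as np

# Invented per-subtask accuracies for one benchmark; none of these numbers come from the commit.
accs = np.array([0.6177, 0.6209, 0.6154])

# Old behaviour: precision was dropped as soon as the result file was parsed.
old_mean_acc = round(float(np.mean(accs)) * 100.0, 1)

# New behaviour: keep the full-precision score; the DataFrame is rounded later, in one place.
new_mean_acc = float(np.mean(accs)) * 100.0

# The per-model average is likewise left unrounded now (benchmark keys are placeholders).
results = {"bench_a": new_mean_acc, "bench_b": 83.58, "bench_c": 64.42, "bench_d": 52.44}
average = sum(v for k, v in results.items()) / 4.0

print(old_mean_acc, new_mean_acc, average)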