Anas Awadalla
committed on
Commit
·
c6efeae
1
Parent(s):
dec2f9b
osworld baselines
Browse files
- src/streamlit_app.py +8 -0
src/streamlit_app.py
CHANGED
@@ -206,6 +206,14 @@ def fetch_leaderboard_data():
|
|
206 |
# Create DataFrame
|
207 |
df = pd.DataFrame(results)
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
# Process checkpoints: for each base model, find the best checkpoint
|
210 |
if not df.empty:
|
211 |
# Group by dataset and base_model
|
|
|
206 |
# Create DataFrame
|
207 |
df = pd.DataFrame(results)
|
208 |
|
209 |
+
# Adjust evaluated results for osworld-g (do not touch baselines)
|
210 |
+
if not df.empty and 'dataset' in df.columns and 'overall_accuracy' in df.columns:
|
211 |
+
osworld_mask = df['dataset'] == 'osworld-g'
|
212 |
+
if osworld_mask.any():
|
213 |
+
df.loc[osworld_mask, 'overall_accuracy'] = (
|
214 |
+
df.loc[osworld_mask, 'overall_accuracy'] * 0.90425531914
|
215 |
+
)
|
216 |
+
|
217 |
# Process checkpoints: for each base model, find the best checkpoint
|
218 |
if not df.empty:
|
219 |
# Group by dataset and base_model
|