Spaces:
Runtime error
Runtime error
rusticluftig
committed on
Commit
·
838067a
1
Parent(s):
9b87de8
Update leaderboard for multi evals
Browse files
app.py
CHANGED
|
@@ -79,11 +79,7 @@ def main():
|
|
| 79 |
gr.HTML(EVALUATION_HEADER)
|
| 80 |
show_stale = gr.Checkbox(label="Show Stale", interactive=True)
|
| 81 |
competition_leaderboards = []
|
| 82 |
-
comp_2 = competitions.COMPETITION_DETAILS[2]
|
| 83 |
-
# Convert the losses into % of correct answers.
|
| 84 |
-
losses_2["losses"] = losses_2["losses"].apply(
|
| 85 |
-
lambda x: 1 - x if x else None
|
| 86 |
-
)
|
| 87 |
with gr.Accordion(f"{comp_2.name} Competition"):
|
| 88 |
gr.HTML(comp_2.html_description)
|
| 89 |
competition_leaderboards.append(
|
|
@@ -94,7 +90,7 @@ def main():
|
|
| 94 |
headers=[
|
| 95 |
"Name",
|
| 96 |
"Win Rate",
|
| 97 |
-
"
|
| 98 |
"Weight",
|
| 99 |
"UID",
|
| 100 |
"Block",
|
|
@@ -117,18 +113,25 @@ def main():
|
|
| 117 |
x="timestamp",
|
| 118 |
x_title="Date",
|
| 119 |
y="losses",
|
| 120 |
-
y_title="
|
| 121 |
interactive=True,
|
| 122 |
visible=True,
|
| 123 |
width=1024,
|
| 124 |
-
title="Best
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
)
|
| 126 |
gr.HTML(
|
| 127 |
"""
|
| 128 |
<ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
|
| 129 |
<li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
|
| 130 |
-
<li><b>
|
| 131 |
-
<li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
|
| 132 |
<li><b>UID:</b> the Bittensor UID of the miner</li>
|
| 133 |
<li><b>Weight:</b> the bittensor weight set for this model</li>
|
| 134 |
<li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
|
|
|
|
| 79 |
gr.HTML(EVALUATION_HEADER)
|
| 80 |
show_stale = gr.Checkbox(label="Show Stale", interactive=True)
|
| 81 |
competition_leaderboards = []
|
| 82 |
+
comp_2 = competitions.COMPETITION_DETAILS[2]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
with gr.Accordion(f"{comp_2.name} Competition"):
|
| 84 |
gr.HTML(comp_2.html_description)
|
| 85 |
competition_leaderboards.append(
|
|
|
|
| 90 |
headers=[
|
| 91 |
"Name",
|
| 92 |
"Win Rate",
|
| 93 |
+
"Score",
|
| 94 |
"Weight",
|
| 95 |
"UID",
|
| 96 |
"Block",
|
|
|
|
| 113 |
x="timestamp",
|
| 114 |
x_title="Date",
|
| 115 |
y="losses",
|
| 116 |
+
y_title="Score",
|
| 117 |
interactive=True,
|
| 118 |
visible=True,
|
| 119 |
width=1024,
|
| 120 |
+
title="Best Score Over Time",
|
| 121 |
+
)
|
| 122 |
+
gr.HTML(
|
| 123 |
+
"""
|
| 124 |
+
The definition of score changes over time as new evaluation tasks are added in releases.
|
| 125 |
+
<ul>
|
| 126 |
+
<li><b>Start-Oct 27</b>: % of wrong answers on synthetic MMLU</li>
|
| 127 |
+
<li><b>Oct 27-Now</b>: + word sorting eval</li>
|
| 128 |
+
"""
|
| 129 |
)
|
| 130 |
gr.HTML(
|
| 131 |
"""
|
| 132 |
<ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
|
| 133 |
<li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
|
| 134 |
+
<li><b>Score:</b> the combined model score as calculated by the OTF validator (lower is better)</li>
|
|
|
|
| 135 |
<li><b>UID:</b> the Bittensor UID of the miner</li>
|
| 136 |
<li><b>Weight:</b> the bittensor weight set for this model</li>
|
| 137 |
<li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
|
utils.py
CHANGED
|
@@ -184,13 +184,6 @@ def get_scores(
|
|
| 184 |
uids (List[int]): List of UIDs to get scores for.
|
| 185 |
wandb_runs (List): List of validator runs from Wandb. Requires the runs are provided in descending order.
|
| 186 |
"""
|
| 187 |
-
|
| 188 |
-
def _maybe_convert_loss(loss: float, comp_id: int) -> float:
|
| 189 |
-
"""Converts loss to score for competitions that require it."""
|
| 190 |
-
if comp_id == 2:
|
| 191 |
-
return 1 - loss if loss else None
|
| 192 |
-
return loss
|
| 193 |
-
|
| 194 |
result = {}
|
| 195 |
previous_timestamp = None
|
| 196 |
seen_competitions = set()
|
|
@@ -216,9 +209,7 @@ def get_scores(
|
|
| 216 |
# Only the most recent run per competition is fresh.
|
| 217 |
is_fresh = comp_id not in seen_competitions
|
| 218 |
result[uid] = {
|
| 219 |
-
"avg_loss": _maybe_convert_loss(
|
| 220 |
-
uid_data.get("average_loss", None), comp_id
|
| 221 |
-
),
|
| 222 |
"win_rate": uid_data.get("win_rate", None),
|
| 223 |
"win_total": uid_data.get("win_total", None),
|
| 224 |
"weight": uid_data.get("weight", None),
|
|
@@ -283,7 +274,8 @@ def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
|
|
| 283 |
if c_id is None or c_id != competition_id:
|
| 284 |
continue
|
| 285 |
|
| 286 |
-
|
|
|
|
| 287 |
best_loss = loss
|
| 288 |
should_add_datapoint = True
|
| 289 |
# Now that we've processed the run's most recent steps, check if we should add a datapoint.
|
|
|
|
| 184 |
uids (List[int]): List of UIDs to get scores for.
|
| 185 |
wandb_runs (List): List of validator runs from Wandb. Requires the runs are provided in descending order.
|
| 186 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
result = {}
|
| 188 |
previous_timestamp = None
|
| 189 |
seen_competitions = set()
|
|
|
|
| 209 |
# Only the most recent run per competition is fresh.
|
| 210 |
is_fresh = comp_id not in seen_competitions
|
| 211 |
result[uid] = {
|
| 212 |
+
"avg_loss": uid_data.get("average_loss", None),
|
|
|
|
|
|
|
| 213 |
"win_rate": uid_data.get("win_rate", None),
|
| 214 |
"win_total": uid_data.get("win_total", None),
|
| 215 |
"weight": uid_data.get("weight", None),
|
|
|
|
| 274 |
if c_id is None or c_id != competition_id:
|
| 275 |
continue
|
| 276 |
|
| 277 |
+
# Filter out issue caused by wandb unavailability.
|
| 278 |
+
if loss < 0.99 and loss < best_loss:
|
| 279 |
best_loss = loss
|
| 280 |
should_add_datapoint = True
|
| 281 |
# Now that we've processed the run's most recent steps, check if we should add a datapoint.
|