Spaces:

macrocosm-os
/

finetuning-leaderboard

Runtime error

App Files Files Community

rusticluftig committed on Oct 28, 2024

Commit

838067a

1 Parent(s): 9b87de8

Update leaderboard for multi evals

Browse files

Files changed (2) hide show

app.py +13 -10
utils.py +3 -11

app.py CHANGED Viewed

@@ -79,11 +79,7 @@ def main():
             gr.HTML(EVALUATION_HEADER)
             show_stale = gr.Checkbox(label="Show Stale", interactive=True)
             competition_leaderboards = []
-            comp_2 = competitions.COMPETITION_DETAILS[2]
-            # Convert the losses into % of correct answers.
-            losses_2["losses"] = losses_2["losses"].apply(
-                lambda x: 1 - x if x else None
-            )
             with gr.Accordion(f"{comp_2.name} Competition"):
                 gr.HTML(comp_2.html_description)
                 competition_leaderboards.append(
@@ -94,7 +90,7 @@ def main():
                         headers=[
                             "Name",
                             "Win Rate",
-                            "MC Score",
                             "Weight",
                             "UID",
                             "Block",
@@ -117,18 +113,25 @@ def main():
                     x="timestamp",
                     x_title="Date",
                     y="losses",
-                    y_title="MC Score",
                     interactive=True,
                     visible=True,
                     width=1024,
-                    title="Best MC Score Over Time",
                 )
             gr.HTML(
                 """
                     <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
                     <li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
-                    <li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
-                    <li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
                     <li><b>UID:</b> the Bittensor UID of the miner</li>
                     <li><b>Weight:</b> the bittensor weight set for this model</li>
                     <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""

             gr.HTML(EVALUATION_HEADER)
             show_stale = gr.Checkbox(label="Show Stale", interactive=True)
             competition_leaderboards = []
+            comp_2 = competitions.COMPETITION_DETAILS[2]
             with gr.Accordion(f"{comp_2.name} Competition"):
                 gr.HTML(comp_2.html_description)
                 competition_leaderboards.append(
                         headers=[
                             "Name",
                             "Win Rate",
+                            "Score",
                             "Weight",
                             "UID",
                             "Block",
                     x="timestamp",
                     x_title="Date",
                     y="losses",
+                    y_title="Score",
                     interactive=True,
                     visible=True,
                     width=1024,
+                    title="Best Score Over Time",
+                )
+                gr.HTML(
+                    """
+                        The definition of score changes over time as new evaluation tasks are added in releases.
+                        <ul>
+                        <li><b>Start-Oct 27</b>: % of wrong answers on synthetic MMLU</li>
+                        <li><b>Oct 27-Now</b>: + word sorting eval</li>
+                        """
                 )
             gr.HTML(
                 """
                     <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
                     <li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
+                    <li><b>Score:</b> the combined model score as calculated by the OTF validator (lower is better)</li>
                     <li><b>UID:</b> the Bittensor UID of the miner</li>
                     <li><b>Weight:</b> the bittensor weight set for this model</li>
                     <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""

utils.py CHANGED Viewed

@@ -184,13 +184,6 @@ def get_scores(
         uids (List[int]): List of UIDs to get scores for.
         wandb_runs (List): List of validator runs from Wandb. Requires that the runs be provided in descending order.
     """
-    def _maybe_convert_loss(loss: float, comp_id: int) -> float:
-        """Converts loss to score for competitions that require it."""
-        if comp_id == 2:
-            return 1 - loss if loss else None
-        return loss
     result = {}
     previous_timestamp = None
     seen_competitions = set()
@@ -216,9 +209,7 @@ def get_scores(
                 # Only the most recent run per competition is fresh.
                 is_fresh = comp_id not in seen_competitions
                 result[uid] = {
-                    "avg_loss": _maybe_convert_loss(
-                        uid_data.get("average_loss", None), comp_id
-                    ),
                     "win_rate": uid_data.get("win_rate", None),
                     "win_total": uid_data.get("win_total", None),
                     "weight": uid_data.get("weight", None),
@@ -283,7 +274,8 @@ def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
                 if c_id is None or c_id != competition_id:
                     continue
-                if loss < best_loss:
                     best_loss = loss
                     should_add_datapoint = True
         # Now that we've processed the run's most recent steps, check if we should add a datapoint.

         uids (List[int]): List of UIDs to get scores for.
         wandb_runs (List): List of validator runs from Wandb. Requires that the runs be provided in descending order.
     """
     result = {}
     previous_timestamp = None
     seen_competitions = set()
                 # Only the most recent run per competition is fresh.
                 is_fresh = comp_id not in seen_competitions
                 result[uid] = {
+                    "avg_loss": uid_data.get("average_loss", None),
                     "win_rate": uid_data.get("win_rate", None),
                     "win_total": uid_data.get("win_total", None),
                     "weight": uid_data.get("weight", None),
                 if c_id is None or c_id != competition_id:
                     continue
+                # Filter out issue caused by wandb unavailability.
+                if loss < 0.99 and loss < best_loss:
                     best_loss = loss
                     should_add_datapoint = True
         # Now that we've processed the run's most recent steps, check if we should add a datapoint.