def _select_metric(task, results):
    """Pick the single score shown on the leaderboard for one benchmark.

    Args:
        task: Lower-cased benchmark name (taken from the task directory name).
        results: The ``"results"`` mapping loaded from one evaluation JSON file.

    Returns:
        The selected metric value, exactly as stored in the file.

    Raises:
        StopIteration: If ``results`` (or its first entry) is empty.
        KeyError: If the metric expected for ``task`` is missing.
    """
    # Most benchmarks are read from the first entry of `results`.
    first_result = results[next(iter(results))]

    # TruthfulQA has two metrics; the leaderboard reports the `mc2` one.
    if task == "truthfulqa":
        return first_result["truthfulqa_mc2"]
    # IFEval has several metrics; only prompt-level loose accuracy is reported.
    if task == "ifeval":
        return first_result["prompt_level_loose_acc"]
    # MMLU is reported as the average, which lives under a fixed key
    # rather than under the first entry.
    if task == "mmlu":
        return results["lighteval|mmlu:_average|5"]["acc"]
    # HellaSwag and ARC report normalised accuracy.
    if task in ("hellaswag", "arc"):
        return first_result["acc_norm"]
    # Any other benchmark: fall back to the first metric of the first entry.
    return first_result[next(iter(first_result))]


def get_leaderboard_df(results_dir="eval_results"):
    """Collect evaluation results from disk into a leaderboard DataFrame.

    Every ``*.json`` file found below ``results_dir`` is read. The code relies
    on the path layout ``<results_dir>/<a>/<b>/<c>/<task>/<file>.json``: the
    three leading components are joined with ``_`` to form the model name
    (in the rendered outputs these look like org, model and revision), and the
    fourth component is the benchmark name. The evaluation date is parsed from
    the trailing ``_``-separated timestamp of the file name. Results are
    grouped into one row per (model, date) pair.

    Args:
        results_dir: Root directory holding the evaluation results. Defaults
            to ``"eval_results"``, the location that was previously hard-coded.

    Returns:
        A DataFrame with columns ``Model``, ``Date``, ``Average`` and one
        column per benchmark (``Ifeval`` first when present). Scores are
        multiplied by 100, rounded to two decimals, and rows are sorted by
        ``Average`` in descending order. When no result file is found, an
        empty frame with the columns ``Model``, ``Date`` and ``Average`` is
        returned.

    Raises:
        KeyError: If a result file lacks ``"results"`` or the expected metric.
        StopIteration: If a result file has an empty ``"results"`` mapping.
    """
    root = Path(results_dir)

    rows = {}  # row label "<a>_<b>_<c>_<date>" -> {column name: value}
    task_columns = []  # benchmark columns in first-seen order

    # Sort the paths so that column order is reproducible across filesystems
    # and, for duplicate (model, date, task) results, the lexicographically
    # last file name (i.e. the latest timestamp) deterministically wins.
    for filepath in sorted(root.rglob("*.json")):
        parts = filepath.relative_to(root).parts
        # Ignore JSON files that are not nested deep enough to be a result.
        if len(parts) < 5:
            continue

        # "<prefix>_<YYYY-MM-DD>T<time>" -> "<YYYY-MM-DD>"
        date = filepath.stem.split("_")[-1][:-3].split("T")[0]
        # The date is part of the label so that runs from different days
        # stay on separate rows; it is stripped from the name again below.
        label = "_".join(parts[:3]) + "_" + date
        task = parts[3].capitalize()

        with open(filepath, "r", encoding="utf-8") as file:
            data = json.load(file)
        value = _select_metric(task.lower(), data["results"])

        rows.setdefault(label, {"Date": date})[task] = value
        if task not in task_columns:
            task_columns.append(task)

    if not rows:
        return pd.DataFrame(columns=["Model", "Date", "Average"])

    # Put IFEval in the first benchmark column when it was evaluated at all.
    if "Ifeval" in task_columns:
        task_columns.remove("Ifeval")
        task_columns.insert(0, "Ifeval")

    # Build the frame in one go instead of growing it cell by cell.
    df = pd.DataFrame(
        list(rows.values()),
        index=list(rows),
        columns=["Date"] + task_columns,
    )
    # Drop rows where every benchmark entry is NaN (e.g. null scores on disk).
    df = df.dropna(how="all", axis=0, subset=task_columns)
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    # Convert all values to percentage
    numeric_columns = df.select_dtypes(include=["number"]).columns
    df[numeric_columns] = df[numeric_columns] * 100.0
    # A stable sort keeps tied rows in a reproducible order.
    df = df.sort_values(by=["Average"], ascending=False, kind="mergesort")
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)
    # Strip off date from model name
    df["Model"] = df["Model"].apply(lambda label: label.rsplit("_", 1)[0])
    return df
| \n", " | Model\n", " | Date\n", " | Average\n", " | Ifeval\n", " | Truthfulqa\n", " | Winogrande\n", " | Gsm8k\n", " | Mmlu\n", " | Hellaswag\n", " | Arc\n", " | 
|---|---|---|---|---|---|---|---|---|---|---|
| 0\n", " | NousResearch_Nous-Hermes-2-Yi-34B_main\n", " | 2024-03-04\n", " | 74.01\n", " | NaN\n", " | 61.44\n", " | 80.58\n", " | NaN\n", " | 76.24\n", " | 83.79\n", " | 68.00\n", " | 
| 1\n", " | deepseek-ai_deepseek-llm-67b-chat_main\n", " | 2024-03-05\n", " | 71.62\n", " | 55.27\n", " | NaN\n", " | NaN\n", " | 76.12\n", " | 71.18\n", " | 83.94\n", " | NaN\n", " | 
| 2\n", " | NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main\n", " | 2024-03-02\n", " | 70.43\n", " | 59.33\n", " | 64.76\n", " | 78.53\n", " | 62.17\n", " | 71.96\n", " | 85.42\n", " | 70.82\n", " | 
| 3\n", " | mistralai_Mixtral-8x7B-Instruct-v0.1_main\n", " | 2024-03-02\n", " | 69.80\n", " | 55.08\n", " | 70.79\n", " | 73.56\n", " | 59.89\n", " | 70.60\n", " | 86.68\n", " | 72.01\n", " | 
| 4\n", " | deepseek-ai_deepseek-llm-67b-chat_main\n", " | 2024-03-04\n", " | 67.03\n", " | NaN\n", " | 57.78\n", " | 79.16\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 64.16\n", " | 
| ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | 
| 269\n", " | HuggingFaceH4_starcoder2-15b-ift_v18.0\n", " | 2024-03-10\n", " | 11.23\n", " | 21.63\n", " | NaN\n", " | NaN\n", " | 0.83\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
| 270\n", " | HuggingFaceH4_mistral-7b-ift_v49.0\n", " | 2024-03-07\n", " | 10.07\n", " | 20.15\n", " | NaN\n", " | NaN\n", " | 0.00\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
| 271\n", " | HuggingFaceH4_starchat-beta_main\n", " | 2024-03-12\n", " | 8.13\n", " | 8.13\n", " | NaN\n", " | NaN\n", " | NaN\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
| 272\n", " | HuggingFaceH4_starcoder2-15b-ift_v7.0\n", " | 2024-03-10\n", " | 7.88\n", " | 12.57\n", " | NaN\n", " | NaN\n", " | 3.18\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
| 273\n", " | HuggingFaceH4_zephyr-7b-beta-ift_v1.1\n", " | 2024-03-13\n", " | 4.71\n", " | 9.43\n", " | NaN\n", " | NaN\n", " | 0.00\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
274 rows × 10 columns
\n", "| \n", " | Model\n", " | Ifeval\n", " | Truthfulqa\n", " | Winogrande\n", " | Gsm8k\n", " | Mmlu\n", " | Hellaswag\n", " | Arc\n", " | 
|---|---|---|---|---|---|---|---|---|
| 0\n", " | HuggingFaceH4_mistral-7b-ift_v41.0\n", " | 44.36\n", " | 49.35\n", " | 72.93\n", " | 37.30\n", " | 60.82\n", " | 79.70\n", " | 58.36\n", " | 
| 1\n", " | HuggingFaceH4_mistral-7b-ift_v41.1\n", " | 47.32\n", " | 47.89\n", " | 72.69\n", " | 36.32\n", " | 60.34\n", " | 79.57\n", " | 57.51\n", " | 
| 2\n", " | HuggingFaceH4_mistral-7b-ift_v41.10\n", " | 32.72\n", " | 51.05\n", " | 72.45\n", " | 25.93\n", " | 59.75\n", " | 81.92\n", " | 59.22\n", " | 
| 3\n", " | HuggingFaceH4_mistral-7b-ift_v41.11\n", " | 37.89\n", " | 51.05\n", " | 64.56\n", " | 17.59\n", " | 57.60\n", " | 77.65\n", " | 55.89\n", " | 
| 4\n", " | HuggingFaceH4_mistral-7b-ift_v41.12\n", " | 37.89\n", " | 45.94\n", " | 63.30\n", " | 21.15\n", " | 58.50\n", " | 74.94\n", " | 52.73\n", " | 
| ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | 
| 258\n", " | mistralai_Mistral-7B-Instruct-v0.2_main\n", " | 53.97\n", " | 70.68\n", " | 68.82\n", " | 38.13\n", " | 59.43\n", " | 83.45\n", " | 65.70\n", " | 
| 259\n", " | mistralai_Mixtral-8x7B-Instruct-v0.1_main\n", " | 55.08\n", " | 70.79\n", " | 73.56\n", " | 59.89\n", " | 70.60\n", " | 86.68\n", " | 72.01\n", " | 
| 260\n", " | openchat_openchat-3.5-0106_main\n", " | 54.71\n", " | 57.55\n", " | 72.53\n", " | 66.19\n", " | 63.72\n", " | 80.10\n", " | 61.01\n", " | 
| 261\n", " | stabilityai_stablelm-zephyr-3b_main\n", " | 34.75\n", " | 46.19\n", " | 58.41\n", " | 40.18\n", " | 45.18\n", " | 71.57\n", " | 45.82\n", " | 
| 262\n", " | teknium_OpenHermes-2.5-Mistral-7B_main\n", " | 52.68\n", " | 58.62\n", " | 72.14\n", " | 54.06\n", " | 63.01\n", " | 82.34\n", " | 62.97\n", " | 
263 rows × 8 columns
\n", "| \n", " | Model\n", " | Date\n", " | Ifeval\n", " | Truthfulqa\n", " | Winogrande\n", " | Gsm8k\n", " | Mmlu\n", " | Hellaswag\n", " | Arc\n", " | 
|---|---|---|---|---|---|---|---|---|---|
| 0\n", " | NousResearch_Nous-Hermes-2-Yi-34B_main\n", " | 2024-03-04\n", " | 39.00\n", " | 61.44\n", " | 80.58\n", " | 67.93\n", " | 76.24\n", " | 83.79\n", " | 68.00\n", " | 
| 1\n", " | deepseek-ai_deepseek-llm-67b-chat_main\n", " | 2024-03-05\n", " | 55.27\n", " | 57.78\n", " | 79.16\n", " | 76.12\n", " | 71.18\n", " | 83.94\n", " | 64.16\n", " | 
| 2\n", " | NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main\n", " | 2024-03-02\n", " | 59.33\n", " | 64.76\n", " | 78.53\n", " | 62.17\n", " | 71.96\n", " | 85.42\n", " | 70.82\n", " | 
| 3\n", " | mistralai_Mixtral-8x7B-Instruct-v0.1_main\n", " | 2024-03-02\n", " | 55.08\n", " | 70.79\n", " | 73.56\n", " | 59.89\n", " | 70.60\n", " | 86.68\n", " | 72.01\n", " | 
| 4\n", " | deepseek-ai_deepseek-llm-67b-chat_main\n", " | 2024-03-04\n", " | 55.27\n", " | 57.78\n", " | 79.16\n", " | 76.12\n", " | 71.18\n", " | 83.94\n", " | 64.16\n", " | 
| ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | ...\n", " | 
| 269\n", " | HuggingFaceH4_starcoder2-15b-ift_v18.0\n", " | 2024-03-10\n", " | 21.63\n", " | NaN\n", " | NaN\n", " | 0.83\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
| 270\n", " | HuggingFaceH4_mistral-7b-ift_v49.0\n", " | 2024-03-07\n", " | 20.15\n", " | NaN\n", " | NaN\n", " | 0.00\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
| 271\n", " | HuggingFaceH4_starchat-beta_main\n", " | 2024-03-12\n", " | 8.13\n", " | NaN\n", " | NaN\n", " | NaN\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
| 272\n", " | HuggingFaceH4_starcoder2-15b-ift_v7.0\n", " | 2024-03-10\n", " | 12.57\n", " | NaN\n", " | NaN\n", " | 3.18\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
| 273\n", " | HuggingFaceH4_zephyr-7b-beta-ift_v1.1\n", " | 2024-03-13\n", " | 9.43\n", " | NaN\n", " | NaN\n", " | 0.00\n", " | NaN\n", " | NaN\n", " | NaN\n", " | 
274 rows × 9 columns
\n", "