{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import gradio as gr\n",
    "import pandas as pd\n",
    "\n",
    "TITLE = \"\"\"
LLM Leaderboard for H4 Models
\"\"\"\n",
    "\n",
    "DESCRIPTION = f\"\"\"\n",
    "Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.\n",
    "\"\"\"\n",
    "\n",
    "BENCHMARKS_TO_SKIP = [\"math\", \"mini_math\"]\n",
    "\n",
    "\n",
    "def get_leaderboard_df(agg : str = \"max\"):\n",
    "    filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n",
    "\n",
    "    # Parse filepaths to get unique models\n",
    "    models = set()\n",
    "    for filepath in filepaths:\n",
    "        path_parts = Path(filepath).parts\n",
    "        model_revision = \"_\".join(path_parts[1:4])\n",
    "        models.add(model_revision)\n",
    "\n",
    "    # Initialize DataFrame\n",
    "    df = pd.DataFrame(index=list(models))\n",
    "\n",
    "    # Extract data from each file and populate the DataFrame\n",
    "    for filepath in filepaths:\n",
    "        path_parts = Path(filepath).parts\n",
    "        date = filepath.stem.split(\"_\")[-1][:-3]\n",
    "        model_revision = \"_\".join(path_parts[1:4]) + \"_\" + date\n",
    "        task = path_parts[4]\n",
    "        df.loc[model_revision, \"Date\"] = date\n",
    "\n",
    "        with open(filepath, \"r\") as file:\n",
    "            data = json.load(file)\n",
    "            first_result_key = next(iter(data[\"results\"]))  # gets the first key in 'results'\n",
    "            # Skip benchmarks that we don't want to include in the leaderboard\n",
    "            if task.lower() in BENCHMARKS_TO_SKIP:\n",
    "                continue\n",
    "            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n",
    "            if task.lower() == \"truthfulqa\":\n",
    "                value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n",
    "            # IFEval has several metrics but we report just the prompt-loose-acc one\n",
    "            elif task.lower() == \"ifeval\":\n",
    "                value = data[\"results\"][first_result_key][\"prompt_level_loose_acc\"]\n",
    "            # MMLU has several metrics but we report just the average one\n",
    "            elif task.lower() == \"mmlu\":\n",
    "                value = [v[\"acc\"] for k, v in data[\"results\"].items() if \"_average\" in k.lower()][0]\n",
    "            # HellaSwag and ARC reports acc_norm\n",
    "            elif task.lower() in [\"hellaswag\", \"arc\"]:\n",
    "                value = data[\"results\"][first_result_key][\"acc_norm\"]\n",
    "            # BBH has several metrics but we report just the average one\n",
    "            elif task.lower() == \"bbh\":\n",
    "                if \"all\" in data[\"results\"]:\n",
    "                    value = data[\"results\"][\"all\"][\"acc\"]\n",
    "                else:\n",
    "                    value = -100\n",
    "            # AGIEval reports acc_norm\n",
    "            elif task.lower() == \"agieval\":\n",
    "                value = data[\"results\"][\"all\"][\"acc_norm\"]\n",
    "            # MATH reports qem\n",
    "            elif task.lower() in [\"math\", \"math_v2\", \"aimo_kaggle\"]:\n",
    "                value = data[\"results\"][\"all\"][\"qem\"]\n",
    "            else:\n",
    "                first_metric_key = next(\n",
    "                    iter(data[\"results\"][first_result_key])\n",
    "                )  # gets the first key in the first result\n",
    "                value = data[\"results\"][first_result_key][first_metric_key]  # gets the value of the first metric\n",
    "\n",
    "            # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe\n",
    "            if task.lower() in [\"mini_math_v2\"]:\n",
    "                for k, v in data[\"results\"].items():\n",
    "                    if k != \"all\":\n",
    "                        level = k.split(\"|\")[1].split(\":\")[-1]\n",
    "                        value = v[\"qem\"]\n",
    "                        df.loc[model_revision, f\"{task}_{level}\"] = value\n",
    "            # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe\n",
    "            elif task.lower() in [\"aimo_kaggle_medium_pot\"]:\n",
    "                for k, v in data[\"results\"].items():\n",
    "                    if k != \"all\" and \"_average\" not in k:\n",
    "                        version = k.split(\"|\")[1].split(\":\")[-1]\n",
    "                        value = v[\"qem\"] if \"qem\" in v else v[\"score\"]\n",
    "                        df.loc[model_revision, f\"{task}_{version}\"] = value\n",
    "            # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe\n",
    "            elif task.lower() in [\"aimo_kaggle_hard_pot\"]:\n",
    "                for k, v in data[\"results\"].items():\n",
    "                    if k != \"all\" and \"_average\" not in k:\n",
    "                        version = k.split(\"|\")[1].split(\":\")[-1]\n",
    "                        value = v[\"qem\"] if \"qem\" in v else v[\"score\"]\n",
    "                        df.loc[model_revision, f\"{task}_{version}\"] = value\n",
    "            # For kaggle_tora we report accuracy, so need  to divide by 100\n",
    "            elif task.lower() in [\n",
    "                \"aimo_tora_eval_kaggle_medium\",\n",
    "                \"aimo_tora_eval_kaggle_hard\",\n",
    "                \"aimo_kaggle_fast_eval_hard\",\n",
    "                \"aimo_kaggle_tora_medium\",\n",
    "                \"aimo_kaggle_tora_hard\",\n",
    "                \"aimo_kaggle_tora_medium_extended\",\n",
    "                \"aimo_kaggle_tora_hard_extended\",\n",
    "            ]:\n",
    "                for k, v in data[\"results\"].items():\n",
    "                    value = float(v[\"qem\"]) / 100.0\n",
    "                    df.loc[model_revision, f\"{task}\"] = value\n",
    "            # For AlpacaEval we report base winrate and lenght corrected one\n",
    "            elif task.lower() == \"alpaca_eval\":\n",
    "                value = data[\"results\"][first_result_key][\"win_rate\"]\n",
    "                df.loc[model_revision, \"Alpaca_eval\"] = value / 100.0\n",
    "                value = data[\"results\"][first_result_key][\"length_controlled_winrate\"]\n",
    "                df.loc[model_revision, \"Alpaca_eval_lc\"] = value / 100.0\n",
    "            else:\n",
    "                df.loc[model_revision, task] = float(value)\n",
    "\n",
    "    # Drop rows where every entry is NaN\n",
    "    df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n",
    "\n",
    "    # Trim minimath column names\n",
    "    df.columns = [c.replace(\"_level_\", \"_l\") for c in df.columns]\n",
    "\n",
    "    # Trim AIMO column names\n",
    "    df.columns = [c.replace(\"aimo_\", \"\") for c in df.columns]\n",
    "\n",
    "    df.insert(loc=0, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
    "\n",
    "    # Convert all values to percentage\n",
    "    df[df.select_dtypes(include=[\"number\"]).columns] *= 100.0\n",
    "    df = df.sort_values(by=[\"Average\"], ascending=False)\n",
    "    df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(2)\n",
    "    # Strip off date from model name\n",
    "    df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n",
    "\n",
    "    # Drop date and aggregate results by model name\n",
    "    df = df.drop(\"Date\", axis=1).groupby(\"Model\").agg(agg).reset_index()\n",
    "\n",
    "    return df"
   ]
  },
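  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal sketch of the directory layout `get_leaderboard_df` assumes, inferred\n",
    "# from the `path_parts` slicing above:\n",
    "#   eval_results/<org>/<model>/<revision>/<task>/results_<timestamp>.json\n",
    "# The path below is hypothetical and purely illustrative.\n",
    "example = Path(\"eval_results/org/model/main/ifeval/results_2024-03-13T12-00-00.000000.json\")\n",
    "model_revision = \"_\".join(example.parts[1:4])  # 'org_model_main'\n",
    "task = example.parts[4]  # 'ifeval'\n",
    "date = example.stem.split(\"_\")[-1][:-3]  # timestamp component used for the Date column\n",
    "print(model_revision, task, date)"
   ]
  },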
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = get_leaderboard_df(agg='mean')"
   ]
  },
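  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy illustration (made-up numbers) of what the final `.groupby(\"Model\").agg(agg)`\n",
    "# step does when a model has several dated runs: `max` keeps the best run per\n",
    "# benchmark, while `mean` averages across runs.\n",
    "toy = pd.DataFrame({\"Model\": [\"m\", \"m\"], \"Gsm8k\": [60.0, 70.0]})\n",
    "print(toy.groupby(\"Model\").agg(\"max\"))   # best run: 70.0\n",
    "print(toy.groupby(\"Model\").agg(\"mean\"))  # average of runs: 65.0"
   ]
  },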
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df"
   ]
  },
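  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal sketch of how the leaderboard could be served with the `gradio`\n",
    "# import above; the actual app cell is not shown in this excerpt, so the\n",
    "# widget choice here is an assumption.\n",
    "with gr.Blocks() as demo:\n",
    "    gr.Markdown(TITLE)\n",
    "    gr.Markdown(DESCRIPTION)\n",
    "    gr.Dataframe(value=df)\n",
    "# demo.launch()"
   ]
  },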
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "\n",
       "
\n",
       "  \n",
       "    \n",
       "      | \n",
       " | Model\n",
       " | Average\n",
       " | kaggle_tora_medium_extended\n",
       " | kaggle_tora_hard_extended\n",
       " | 
\n",
       "  \n",
       "  \n",
       "    \n",
       "      | 1741\n",
       " | AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits\n",
       " | 28.89\n",
       " | 61.45\n",
       " | 28.89\n",
       " | 
\n",
       "  \n",
       "
\n",
       "
\n",
       "\n",
       "
\n",
       "  \n",
       "    \n",
       "      | \n",
       " | Model\n",
       " | Average\n",
       " | kaggle_tora_medium_extended\n",
       " | kaggle_tora_hard_extended\n",
       " | 
\n",
       "  \n",
       "  \n",
       "    \n",
       "      | 1741\n",
       " | AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits\n",
       " | 65.06\n",
       " | 65.06\n",
       " | 32.22\n",
       " | 
\n",
       "  \n",
       "
\n",
       "
\n",
       "\n",
       "
\n",
       "  \n",
       "    \n",
       "      | \n",
       " | Model\n",
       " | Date\n",
       " | Ifeval\n",
       " | Truthfulqa\n",
       " | Winogrande\n",
       " | Gsm8k\n",
       " | Mmlu\n",
       " | Hellaswag\n",
       " | Arc\n",
       " | 
\n",
       "  \n",
       "  \n",
       "    \n",
       "      | 0\n",
       " | NousResearch_Nous-Hermes-2-Yi-34B_main\n",
       " | 2024-03-04\n",
       " | 39.00\n",
       " | 61.44\n",
       " | 80.58\n",
       " | 67.93\n",
       " | 76.24\n",
       " | 83.79\n",
       " | 68.00\n",
       " | 
\n",
       "    \n",
       "      | 1\n",
       " | deepseek-ai_deepseek-llm-67b-chat_main\n",
       " | 2024-03-05\n",
       " | 55.27\n",
       " | 57.78\n",
       " | 79.16\n",
       " | 76.12\n",
       " | 71.18\n",
       " | 83.94\n",
       " | 64.16\n",
       " | 
\n",
       "    \n",
       "      | 2\n",
       " | NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main\n",
       " | 2024-03-02\n",
       " | 59.33\n",
       " | 64.76\n",
       " | 78.53\n",
       " | 62.17\n",
       " | 71.96\n",
       " | 85.42\n",
       " | 70.82\n",
       " | 
\n",
       "    \n",
       "      | 3\n",
       " | mistralai_Mixtral-8x7B-Instruct-v0.1_main\n",
       " | 2024-03-02\n",
       " | 55.08\n",
       " | 70.79\n",
       " | 73.56\n",
       " | 59.89\n",
       " | 70.60\n",
       " | 86.68\n",
       " | 72.01\n",
       " | 
\n",
       "    \n",
       "      | 4\n",
       " | deepseek-ai_deepseek-llm-67b-chat_main\n",
       " | 2024-03-04\n",
       " | 55.27\n",
       " | 57.78\n",
       " | 79.16\n",
       " | 76.12\n",
       " | 71.18\n",
       " | 83.94\n",
       " | 64.16\n",
       " | 
\n",
       "    \n",
       "      | ...\n",
       " | ...\n",
       " | ...\n",
       " | ...\n",
       " | ...\n",
       " | ...\n",
       " | ...\n",
       " | ...\n",
       " | ...\n",
       " | ...\n",
       " | 
\n",
       "    \n",
       "      | 269\n",
       " | HuggingFaceH4_starcoder2-15b-ift_v18.0\n",
       " | 2024-03-10\n",
       " | 21.63\n",
       " | NaN\n",
       " | NaN\n",
       " | 0.83\n",
       " | NaN\n",
       " | NaN\n",
       " | NaN\n",
       " | 
\n",
       "    \n",
       "      | 270\n",
       " | HuggingFaceH4_mistral-7b-ift_v49.0\n",
       " | 2024-03-07\n",
       " | 20.15\n",
       " | NaN\n",
       " | NaN\n",
       " | 0.00\n",
       " | NaN\n",
       " | NaN\n",
       " | NaN\n",
       " | 
\n",
       "    \n",
       "      | 271\n",
       " | HuggingFaceH4_starchat-beta_main\n",
       " | 2024-03-12\n",
       " | 8.13\n",
       " | NaN\n",
       " | NaN\n",
       " | NaN\n",
       " | NaN\n",
       " | NaN\n",
       " | NaN\n",
       " | 
\n",
       "    \n",
       "      | 272\n",
       " | HuggingFaceH4_starcoder2-15b-ift_v7.0\n",
       " | 2024-03-10\n",
       " | 12.57\n",
       " | NaN\n",
       " | NaN\n",
       " | 3.18\n",
       " | NaN\n",
       " | NaN\n",
       " | NaN\n",
       " | 
\n",
       "    \n",
       "      | 273\n",
       " | HuggingFaceH4_zephyr-7b-beta-ift_v1.1\n",
       " | 2024-03-13\n",
       " | 9.43\n",
       " | NaN\n",
       " | NaN\n",
       " | 0.00\n",
       " | NaN\n",
       " | NaN\n",
       " | NaN\n",
       " | 
\n",
       "  \n",
       "
\n",
       "
274 rows × 9 columns
\n",
       "