diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index 913de00581edbc7ba6478a568f9c21e89e09e59d..8e470f09733b99cf459d011359c0f0911e659d4d
--- a/README.md
+++ b/README.md
@@ -1,13 +1,17 @@
 ---
 title: Self Improving Leaderboard
-emoji: 🦀
-colorFrom: purple
-colorTo: green
+emoji: 🔄
+colorFrom: green
+colorTo: indigo
 sdk: gradio
-sdk_version: 4.40.0
+sdk_version: 4.36.0
 app_file: app.py
-pinned: false
+pinned: true
 license: apache-2.0
+duplicated_from: upstage/open-ko-llm-leaderboard
+fullWidth: true
+tags:
+  - leaderboard
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100755
index 0000000000000000000000000000000000000000..0c3250565871e6c16d5e80310a91cda8f8ec9e37
--- /dev/null
+++ b/app.py
@@ -0,0 +1,400 @@
+import gradio as gr
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+from gradio_space_ci import configure_space_ci # FOR CI
+
+from src.display.about import (
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    NUMERIC_INTERVALS,
+    TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.tools.collections import update_collections
+from src.tools.plots import (
+    create_metric_plot_obj,
+    create_plot_df,
+    create_scores_df,
+)
+
+
+def restart_space():  # Ask the Hub API to restart this Space (REPO_ID), authenticating with H4_TOKEN
+    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+
+try:  # Mirror the QUEUE_REPO dataset into EVAL_REQUESTS_PATH
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
+except Exception:  # any download failure requests a Space restart; execution then continues below
+    restart_space()
+try:  # Mirror the RESULTS_REPO dataset into EVAL_RESULTS_PATH
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
+except Exception:  # same handling as for the queue download
+    restart_space()
+
+
+_, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)  # first return value is discarded
+leaderboard_df = original_df.copy()  # working copy; `original_df` itself backs the hidden search table
+
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+    failed_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)  # one frame per accordion on the Submit tab
+
+
+# Searching and filtering
+def update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    type_query: list,
+    precision_query: list,
+    size_query: list,
+    show_deleted=None,  # unused as a flag; with the six-input UI wiring the search text arrives in this slot
+    show_merges=None, show_flagged=None,  # unused, kept so the signature stays compatible
+    query: str = None,
+):
+    """Filter the leaderboard by type/size/precision and search text, then keep the selected columns."""
+    if query is None:  # six positional inputs: the search-bar value was bound to `show_deleted`
+        query = show_deleted if isinstance(show_deleted, str) else ""
+    filtered_df = filter_queries(query, filter_models(hidden_df, type_query, size_query, precision_query))
+    return select_columns(filtered_df, columns)
+
+def quarter_update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    type_query: list,
+    precision_query: list,
+    size_query: list,
+    show_deleted=None,  # unused as a flag; with a six-input wiring the search text arrives in this slot
+    show_merges=None, show_flagged=None,  # unused, kept so the signature stays compatible
+    query: str = None,
+):
+    """Filter by type/size/precision and search text, then pick columns via `quarter_select_columns`."""
+    if query is None:  # six positional inputs: the search-bar value was bound to `show_deleted`
+        query = show_deleted if isinstance(show_deleted, str) else ""
+    filtered_df = filter_queries(query, filter_models(hidden_df, type_query, size_query, precision_query))
+    return quarter_select_columns(filtered_df, columns)  # NOTE(review): not defined or imported in app.py
+
+
+def load_query(request: gr.Request):  # wired to `demo.load`; reads the `query` URL parameter, "" when absent or empty
+    query = request.query_params.get("query") or ""
+    return query, query # one copy for `search_bar`, one for the hidden textbox whose change event refreshes the table
+
+
+def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:  # literal, case-insensitive substring match on the dummy column
+    return df[df[AutoEvalColumn.dummy.name].str.contains(query, case=False, regex=False, na=False)]
+
+
+def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    """Return `df` limited to the pinned columns, the requested `columns` (in COLS order) and the dummy column."""
+    pinned = [
+        AutoEvalColumn.model_type_symbol.name,
+        AutoEvalColumn.model.name,
+    ]
+    # Walking COLS (not `columns`) keeps the canonical column ordering
+    requested = [col for col in COLS if col in df.columns and col in columns]
+    trailing = [AutoEvalColumn.dummy.name]
+    return df[pinned + requested + trailing]
+
+
+def filter_queries(query: str, filtered_df: pd.DataFrame):
+    """Keep rows matching any `;`-separated term of `query`; return the input unchanged if nothing matches."""
+    if query == "":
+        return filtered_df
+
+    matches = []
+    for term in (part.strip() for part in query.split(";")):
+        if term == "":
+            continue
+        hits = search_table(filtered_df, term)
+        if len(hits) > 0:
+            matches.append(hits)
+    if len(matches) == 0:
+        return filtered_df
+
+    # A row can match several terms, so de-duplicate on (model, precision, revision)
+    dedup_keys = [AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+    return pd.concat(matches).drop_duplicates(subset=dedup_keys)
+
+
+def filter_models(
+    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list
+) -> pd.DataFrame:
+    """Keep rows whose type symbol, precision and parameter count match the selected filters."""
+    # Only the first character of each type label is compared against the symbol column
+    symbols = [label[0] for label in type_query]
+    type_ok = df[AutoEvalColumn.model_type_symbol.name].isin(symbols)
+    df = df.loc[type_ok]
+    df = df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+    size_ranges = pd.IntervalIndex(sorted(NUMERIC_INTERVALS[key] for key in size_query))
+    param_counts = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+    in_range = param_counts.apply(lambda count: any(size_ranges.contains(count)))
+    return df.loc[in_range]
+
+
+leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision])  # initial view: every type, size and precision selected
+
+print(leaderboard_df)  # prints the filtered frame once at import time
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🔄 Self-Improving Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if not c.hidden and not c.never_hidden and not c.dummy
+                            ],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+
+                with gr.Column(min_width=320):
+                    #with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_precision = gr.CheckboxGroup(
+                        label="Precision",
+                        choices=[i.value.name for i in Precision],
+                        value=[i.value.name for i in Precision],
+                        interactive=True,
+                        elem_id="filter-columns-precision",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            leaderboard_table = gr.components.Dataframe(
+                value=leaderboard_df[
+                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                    + shown_columns.value
+                    + [AutoEvalColumn.dummy.name]
+                ],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+                #column_widths=["2%", "33%"] 
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=original_df[COLS],
+                headers=COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    filter_columns_type,
+                    filter_columns_precision,
+                    filter_columns_size,
+                    search_bar,
+                ],
+                leaderboard_table,
+            )
+
+            # Define a hidden component that will trigger a reload only if a query parameter has been set
+            hidden_search_bar = gr.Textbox(value="", visible=False)
+            hidden_search_bar.change(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    filter_columns_type,
+                    filter_columns_precision,
+                    filter_columns_size,
+                    search_bar,
+                ],
+                leaderboard_table,
+            )
+            # Check query parameter once at startup and update search bar + hidden component
+            demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
+            
+            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        filter_columns_type,
+                        filter_columns_precision,
+                        filter_columns_size,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"❌ Failed Evaluations ({len(failed_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            failed_eval_table = gr.components.Dataframe(  # table of failed evaluation requests
+                                value=failed_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
+                        multiselect=False,
+                        value=ModelType.IFT.to_str(" : "),
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=[i.value.name for i in WeightType],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Evaluation!")
+            submission_result = gr.Markdown()  # receives the value returned by `add_new_eval`
+            submit_button.click(
+                add_new_eval,
+                [  # component values are passed to `add_new_eval` positionally, in this order
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    private,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)  # call `restart_space` every 30 minutes
+scheduler.start()
+
+# Launches both the Space and its CI
+configure_space_ci(
+    demo.queue(default_concurrency_limit=40),
+    trusted_authors=[],  # add manually trusted authors
+    private=True,  # a real boolean, not the string "True"; use "auto" to mirror the main space's visibility instead
+    variables={},  # empty mapping: no variables are set explicitly here
+    secrets=["HF_TOKEN", "H4_TOKEN"],  # which secret do I want to copy from the main space? Can be a `List[str]`.
+    hardware=None,  # "cpu-basic" by default. Otherwise set to "auto" to have same hardware as the main space or any valid string value.
+    storage=None,  # no storage by default. Otherwise set to "auto" to have same storage as the main space or any valid string value.
+).launch()
\ No newline at end of file
diff --git a/eval-queue/.gitattributes b/eval-queue/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..28df5f900b358436f0267334b3e3e9af33f917ba
--- /dev/null
+++ b/eval-queue/.gitattributes
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
diff --git a/eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json b/eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..0e4f2e315d739b98393c777b9a806f1ac5a840a9
--- /dev/null
+++ b/eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json
@@ -0,0 +1,14 @@
+{
+  "model": "01-ai/Yi-1.5-9B-32K",
+  "base_model": "",
+  "revision": "c0239dbc923b8a2b5ca849763bdd592d39c60850",
+  "private": false,
+  "precision": "float16",
+  "weight_type": "Original",
+  "status": "FINISHED",
+  "submitted_time": "2024-07-29T13:10:13Z",
+  "model_type": "\ud83d\udfe2 : pretrained",
+  "likes": 18,
+  "params": 8.829,
+  "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json b/eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f1d37cdaa1a844889dc0b86f1d37987eac18d17
--- /dev/null
+++ b/eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "BioMistral/BioMistral-7B",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2024-05-30 01:33:58",
+    "model_type": "\u2b55 : instruction-tuned",
+    "job_id": "2031",
+    "params": 7.0,
+    "likes": 354,
+    "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json b/eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9bdc0c2fb5a5f1325934e777c01ca6674591358
--- /dev/null
+++ b/eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json
@@ -0,0 +1,14 @@
+{
+  "model": "EleutherAI/polyglot-ko-1.3b",
+  "base_model": "",
+  "revision": "main",
+  "private": false,
+  "precision": "float16",
+  "weight_type": "Original",
+  "status": "FINISHED",
+  "submitted_time": "2024-07-25T11:04:40Z",
+  "model_type": "\ud83d\udfe2 : pretrained",
+  "likes": 71,
+  "params": 1.432,
+  "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json b/eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c06f98202b82aab66fd3f351c7eb3849b9b9d32
--- /dev/null
+++ b/eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "HuggingFaceH4/zephyr-7b-beta",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-11-01 04:21:47",
+    "model_type": "\u2b55 : instruction-tuned",
+    "job_id": "401",
+    "params": 7.242,
+    "likes": 1162,
+    "license": "mit"
+}
\ No newline at end of file
diff --git a/eval-queue/README.md b/eval-queue/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7b95401dc46245ac339fc25059d4a56d90b4cde5
--- /dev/null
+++ b/eval-queue/README.md
@@ -0,0 +1,3 @@
+---
+license: apache-2.0
+---
diff --git a/eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json b/eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..c04269a4cb1f73140fcb3ef175dd8a784955b71a
--- /dev/null
+++ b/eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "nlpai-lab/KULLM3",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2024-04-08 05:16:47",
+    "model_type": "\u2b55 : instruction-tuned",
+    "job_id": "1751",
+    "params": 10.732000350952148,
+    "likes": 13,
+    "license": "cc-by-nc-4.0"
+}
diff --git a/eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json b/eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ef469f131f52aba9274a2681184555924cc23ef
--- /dev/null
+++ b/eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "x2bee/POLAR-14B-DPO-v1.3",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2024-05-23 11:59:50",
+    "model_type": "\u2b55 : instruction-tuned",
+    "job_id": "1987",
+    "params": 14.220999717712402,
+    "likes": 0,
+    "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json b/eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c9f43b54371ecc812142ccabeb144da0cb8e746
--- /dev/null
+++ b/eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "x2bee/POLAR-14B-DPO-v1.4",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2024-05-27 15:02:47",
+    "model_type": "\u2b55 : instruction-tuned",
+    "job_id": "2004",
+    "params": 14.220999717712402,
+    "likes": 0,
+    "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json b/eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ab7a7823d68a62943fb46fe594bc2af66aea731
--- /dev/null
+++ b/eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "x2bee/POLAR-14B-HES-DPO-v1.5",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2024-05-29 23:53:33",
+    "model_type": "\u2b55 : instruction-tuned",
+    "job_id": "2029",
+    "params": 14.220999717712402,
+    "likes": 0,
+    "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json b/eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..e05dcf514240d93da66d455500e83888cb398899
--- /dev/null
+++ b/eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "x2bee/POLAR-14B-SON-SFT-v0.1",
+    "base_model": "x2bee/POLAR-14B-v0.2",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2024-05-27 13:52:58",
+    "model_type": "\u2b55 : instruction-tuned",
+    "job_id": "2003",
+    "params": 14.220999717712402,
+    "likes": 0,
+    "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json b/eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..4bd6f0ba567b6ddb914195db30cb389d37b987e1
--- /dev/null
+++ b/eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "x2bee/POLAR-14B-v0.2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2024-05-02 00:34:33",
+    "model_type": "\ud83d\udfe2 : pretrained",
+    "job_id": "1874",
+    "params": 14.220999717712402,
+    "likes": 0,
+    "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json b/eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..4fae812cffc7b05a196cb62a7d3f7bb5d6c1aac8
--- /dev/null
+++ b/eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json
@@ -0,0 +1,15 @@
+{
+    "model": "x2bee/POLAR-14B-v0.5",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2024-06-05 00:49:59",
+    "model_type": "\ud83d\udfe2 : pretrained",
+    "job_id": "2041",
+    "params": 14.220999717712402,
+    "likes": 0,
+    "license": "apache-2.0"
+}
\ No newline at end of file
diff --git a/eval-results/.gitattributes b/eval-results/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..28df5f900b358436f0267334b3e3e9af33f917ba
--- /dev/null
+++ b/eval-results/.gitattributes
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
diff --git a/eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json b/eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json
new file mode 100644
index 0000000000000000000000000000000000000000..f01f8a60f89596f15703a48706a96da80d000f73
--- /dev/null
+++ b/eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json	
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 7
+        },
+        "quarterly": {
+          "quarterly": 7
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.29948805460750855,
+            "acc_stderr": 0.013385021637313567,
+            "acc_norm": 0.3506825938566553,
+            "acc_norm_stderr": 0.013944635930726089
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.3333001394144593,
+            "acc_stderr": 0.004704293898729902,
+            "acc_norm": 0.4137621987651862,
+            "acc_norm_stderr": 0.004915003499517831
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.47953216374269003,
+            "acc_stderr": 0.0383161053282193,
+            "acc_norm": 0.47953216374269003,
+            "acc_norm_stderr": 0.0383161053282193
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.5631067961165048,
+            "acc_stderr": 0.049111471073657764,
+            "acc_norm": 0.5631067961165048,
+            "acc_norm_stderr": 0.049111471073657764
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.47509578544061304,
+            "acc_stderr": 0.01785777070490102,
+            "acc_norm": 0.47509578544061304,
+            "acc_norm_stderr": 0.01785777070490102
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.28888888888888886,
+            "acc_stderr": 0.0391545063041425,
+            "acc_norm": 0.28888888888888886,
+            "acc_norm_stderr": 0.0391545063041425
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.31,
+            "acc_stderr": 0.04648231987117316,
+            "acc_norm": 0.31,
+            "acc_norm_stderr": 0.04648231987117316
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.46808510638297873,
+            "acc_stderr": 0.03261936918467382,
+            "acc_norm": 0.46808510638297873,
+            "acc_norm_stderr": 0.03261936918467382
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.45180722891566266,
+            "acc_stderr": 0.03874371556587953,
+            "acc_norm": 0.45180722891566266,
+            "acc_norm_stderr": 0.03874371556587953
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.47266881028938906,
+            "acc_stderr": 0.028355633568328188,
+            "acc_norm": 0.47266881028938906,
+            "acc_norm_stderr": 0.028355633568328188
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.45739910313901344,
+            "acc_stderr": 0.033435777055830646,
+            "acc_norm": 0.45739910313901344,
+            "acc_norm_stderr": 0.033435777055830646
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.5267175572519084,
+            "acc_stderr": 0.04379024936553894,
+            "acc_norm": 0.5267175572519084,
+            "acc_norm_stderr": 0.04379024936553894
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.39,
+            "acc_stderr": 0.04902071300001975,
+            "acc_norm": 0.39,
+            "acc_norm_stderr": 0.04902071300001975
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.5555555555555556,
+            "acc_stderr": 0.035402943770953675,
+            "acc_norm": 0.5555555555555556,
+            "acc_norm_stderr": 0.035402943770953675
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.5724137931034483,
+            "acc_stderr": 0.04122737111370332,
+            "acc_norm": 0.5724137931034483,
+            "acc_norm_stderr": 0.04122737111370332
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.3137254901960784,
+            "acc_stderr": 0.04617034827006716,
+            "acc_norm": 0.3137254901960784,
+            "acc_norm_stderr": 0.04617034827006716
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.5,
+            "acc_stderr": 0.032478490123081544,
+            "acc_norm": 0.5,
+            "acc_norm_stderr": 0.032478490123081544
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.47692307692307695,
+            "acc_stderr": 0.025323990861736125,
+            "acc_norm": 0.47692307692307695,
+            "acc_norm_stderr": 0.025323990861736125
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.53,
+            "acc_stderr": 0.05016135580465919,
+            "acc_norm": 0.53,
+            "acc_norm_stderr": 0.05016135580465919
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.37,
+            "acc_stderr": 0.048523658709391,
+            "acc_norm": 0.37,
+            "acc_norm_stderr": 0.048523658709391
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.5740740740740741,
+            "acc_stderr": 0.047803436269367894,
+            "acc_norm": 0.5740740740740741,
+            "acc_norm_stderr": 0.047803436269367894
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.4187192118226601,
+            "acc_stderr": 0.03471192860518468,
+            "acc_norm": 0.4187192118226601,
+            "acc_norm_stderr": 0.03471192860518468
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.47419354838709676,
+            "acc_stderr": 0.02840609505765332,
+            "acc_norm": 0.47419354838709676,
+            "acc_norm_stderr": 0.02840609505765332
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.6752136752136753,
+            "acc_stderr": 0.03067902276549883,
+            "acc_norm": 0.6752136752136753,
+            "acc_norm_stderr": 0.03067902276549883
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.44150943396226416,
+            "acc_stderr": 0.030561590426731833,
+            "acc_norm": 0.44150943396226416,
+            "acc_norm_stderr": 0.030561590426731833
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.4727272727272727,
+            "acc_stderr": 0.04782001791380063,
+            "acc_norm": 0.4727272727272727,
+            "acc_norm_stderr": 0.04782001791380063
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.4185185185185185,
+            "acc_stderr": 0.030078013075022066,
+            "acc_norm": 0.4185185185185185,
+            "acc_norm_stderr": 0.030078013075022066
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.304635761589404,
+            "acc_stderr": 0.03757949922943343,
+            "acc_norm": 0.304635761589404,
+            "acc_norm_stderr": 0.03757949922943343
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.6069651741293532,
+            "acc_stderr": 0.0345368246603156,
+            "acc_norm": 0.6069651741293532,
+            "acc_norm_stderr": 0.0345368246603156
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.4046242774566474,
+            "acc_stderr": 0.03742461193887248,
+            "acc_norm": 0.4046242774566474,
+            "acc_norm_stderr": 0.03742461193887248
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.5476190476190477,
+            "acc_stderr": 0.02563425811555495,
+            "acc_norm": 0.5476190476190477,
+            "acc_norm_stderr": 0.02563425811555495
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.3472222222222222,
+            "acc_stderr": 0.039812405437178615,
+            "acc_norm": 0.3472222222222222,
+            "acc_norm_stderr": 0.039812405437178615
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.04725815626252605,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.04725815626252605
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.57,
+            "acc_stderr": 0.04975698519562426,
+            "acc_norm": 0.57,
+            "acc_norm_stderr": 0.04975698519562426
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.49710982658959535,
+            "acc_stderr": 0.026918645383239015,
+            "acc_norm": 0.49710982658959535,
+            "acc_norm_stderr": 0.026918645383239015
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.5276073619631901,
+            "acc_stderr": 0.03922378290610991,
+            "acc_norm": 0.5276073619631901,
+            "acc_norm_stderr": 0.03922378290610991
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.49691358024691357,
+            "acc_stderr": 0.027820214158594377,
+            "acc_norm": 0.49691358024691357,
+            "acc_norm_stderr": 0.027820214158594377
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.45,
+            "acc_stderr": 0.05,
+            "acc_norm": 0.45,
+            "acc_norm_stderr": 0.05
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.49222797927461137,
+            "acc_stderr": 0.03608003225569654,
+            "acc_norm": 0.49222797927461137,
+            "acc_norm_stderr": 0.03608003225569654
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.41228070175438597,
+            "acc_stderr": 0.046306532033665956,
+            "acc_norm": 0.41228070175438597,
+            "acc_norm_stderr": 0.046306532033665956
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.5027522935779817,
+            "acc_stderr": 0.02143699835976532,
+            "acc_norm": 0.5027522935779817,
+            "acc_norm_stderr": 0.02143699835976532
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.40476190476190477,
+            "acc_stderr": 0.04390259265377561,
+            "acc_norm": 0.40476190476190477,
+            "acc_norm_stderr": 0.04390259265377561
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.49019607843137253,
+            "acc_stderr": 0.028624412550167958,
+            "acc_norm": 0.49019607843137253,
+            "acc_norm_stderr": 0.028624412550167958
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.5,
+            "acc_stderr": 0.050251890762960605,
+            "acc_norm": 0.5,
+            "acc_norm_stderr": 0.050251890762960605
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.7355371900826446,
+            "acc_stderr": 0.04026187527591205,
+            "acc_norm": 0.7355371900826446,
+            "acc_norm_stderr": 0.04026187527591205
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.45394736842105265,
+            "acc_stderr": 0.04051646342874142,
+            "acc_norm": 0.45394736842105265,
+            "acc_norm_stderr": 0.04051646342874142
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.39705882352941174,
+            "acc_stderr": 0.019794488900024113,
+            "acc_norm": 0.39705882352941174,
+            "acc_norm_stderr": 0.019794488900024113
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.40070921985815605,
+            "acc_stderr": 0.029233465745573086,
+            "acc_norm": 0.40070921985815605,
+            "acc_norm_stderr": 0.029233465745573086
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.39285714285714285,
+            "acc_stderr": 0.04635550135609976,
+            "acc_norm": 0.39285714285714285,
+            "acc_norm_stderr": 0.04635550135609976
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.4675925925925926,
+            "acc_stderr": 0.034028015813589656,
+            "acc_norm": 0.4675925925925926,
+            "acc_norm_stderr": 0.034028015813589656
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.3329608938547486,
+            "acc_stderr": 0.015761716178397552,
+            "acc_norm": 0.3329608938547486,
+            "acc_norm_stderr": 0.015761716178397552
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.43,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.43,
+            "acc_norm_stderr": 0.049756985195624284
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.76,
+            "acc_stderr": 0.042923469599092816,
+            "acc_norm": 0.76,
+            "acc_norm_stderr": 0.042923469599092816
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.35294117647058826,
+            "acc_stderr": 0.029029422815681404,
+            "acc_norm": 0.35294117647058826,
+            "acc_norm_stderr": 0.029029422815681404
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6163265306122448,
+            "acc_stderr": 0.031130880396235943,
+            "acc_norm": 0.6163265306122448,
+            "acc_norm_stderr": 0.031130880396235943
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.5654008438818565,
+            "acc_stderr": 0.03226759995510145,
+            "acc_norm": 0.5654008438818565,
+            "acc_norm_stderr": 0.03226759995510145
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.36571056062581486,
+            "acc_stderr": 0.012301028188840567,
+            "acc_norm": 0.36571056062581486,
+            "acc_norm_stderr": 0.012301028188840567
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.4852941176470588,
+            "acc_stderr": 0.03507793834791324,
+            "acc_norm": 0.4852941176470588,
+            "acc_norm_stderr": 0.03507793834791324
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.5151515151515151,
+            "acc_stderr": 0.03902551007374448,
+            "acc_norm": 0.5151515151515151,
+            "acc_norm_stderr": 0.03902551007374448
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.2937576499388005,
+            "mc1_stderr": 0.015945068581236614,
+            "mc2": 0.4670848140389129,
+            "mc2_stderr": 0.01585178282587417
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.47107438016528924,
+            "acc_stderr": 0.017161563949916348,
+            "acc_norm": 0.5171192443919717,
+            "acc_norm_stderr": 0.017180275246085626
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "01-ai/Yi-1.5-9B-32K",
+        "model_sha": "c0239dbc923b8a2b5ca849763bdd592d39c60850",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json b/eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json
new file mode 100644
index 0000000000000000000000000000000000000000..84b56b65cf0b5b96038c5186afc4f253243511cb
--- /dev/null
+++ b/eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json	
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 10
+        },
+        "quarterly": {
+          "quarterly": 10
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.257679180887372,
+            "acc_stderr": 0.012780770562768416,
+            "acc_norm": 0.3122866894197952,
+            "acc_norm_stderr": 0.013542598541688065
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.3229436367257518,
+            "acc_stderr": 0.004666457279979418,
+            "acc_norm": 0.39255128460466043,
+            "acc_norm_stderr": 0.004873203269366306
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.34502923976608185,
+            "acc_stderr": 0.036459813773888065,
+            "acc_norm": 0.34502923976608185,
+            "acc_norm_stderr": 0.036459813773888065
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.4368932038834951,
+            "acc_stderr": 0.04911147107365778,
+            "acc_norm": 0.4368932038834951,
+            "acc_norm_stderr": 0.04911147107365778
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.3780332056194125,
+            "acc_stderr": 0.017339844462104625,
+            "acc_norm": 0.3780332056194125,
+            "acc_norm_stderr": 0.017339844462104625
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.3037037037037037,
+            "acc_stderr": 0.039725528847851355,
+            "acc_norm": 0.3037037037037037,
+            "acc_norm_stderr": 0.039725528847851355
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.37,
+            "acc_stderr": 0.04852365870939099,
+            "acc_norm": 0.37,
+            "acc_norm_stderr": 0.04852365870939099
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.28085106382978725,
+            "acc_stderr": 0.02937917046412482,
+            "acc_norm": 0.28085106382978725,
+            "acc_norm_stderr": 0.02937917046412482
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.3373493975903614,
+            "acc_stderr": 0.03680783690727581,
+            "acc_norm": 0.3373493975903614,
+            "acc_norm_stderr": 0.03680783690727581
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.3954983922829582,
+            "acc_stderr": 0.027770918531427838,
+            "acc_norm": 0.3954983922829582,
+            "acc_norm_stderr": 0.027770918531427838
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.34977578475336324,
+            "acc_stderr": 0.03200736719484503,
+            "acc_norm": 0.34977578475336324,
+            "acc_norm_stderr": 0.03200736719484503
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.3969465648854962,
+            "acc_stderr": 0.04291135671009224,
+            "acc_norm": 0.3969465648854962,
+            "acc_norm_stderr": 0.04291135671009224
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.42,
+            "acc_stderr": 0.049604496374885836,
+            "acc_norm": 0.42,
+            "acc_norm_stderr": 0.049604496374885836
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.4292929292929293,
+            "acc_stderr": 0.03526552724601199,
+            "acc_norm": 0.4292929292929293,
+            "acc_norm_stderr": 0.03526552724601199
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.04082482904638628,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.04082482904638628
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.30392156862745096,
+            "acc_stderr": 0.045766654032077636,
+            "acc_norm": 0.30392156862745096,
+            "acc_norm_stderr": 0.045766654032077636
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.40336134453781514,
+            "acc_stderr": 0.031866081214088314,
+            "acc_norm": 0.40336134453781514,
+            "acc_norm_stderr": 0.031866081214088314
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.40512820512820513,
+            "acc_stderr": 0.024890471769938145,
+            "acc_norm": 0.40512820512820513,
+            "acc_norm_stderr": 0.024890471769938145
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.48,
+            "acc_stderr": 0.050211673156867795,
+            "acc_norm": 0.48,
+            "acc_norm_stderr": 0.050211673156867795
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.32,
+            "acc_stderr": 0.04688261722621505,
+            "acc_norm": 0.32,
+            "acc_norm_stderr": 0.04688261722621505
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.49074074074074076,
+            "acc_stderr": 0.04832853553437055,
+            "acc_norm": 0.49074074074074076,
+            "acc_norm_stderr": 0.04832853553437055
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.37438423645320196,
+            "acc_stderr": 0.03405155380561952,
+            "acc_norm": 0.37438423645320196,
+            "acc_norm_stderr": 0.03405155380561952
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.36774193548387096,
+            "acc_stderr": 0.027430866579973474,
+            "acc_norm": 0.36774193548387096,
+            "acc_norm_stderr": 0.027430866579973474
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.5598290598290598,
+            "acc_stderr": 0.0325207417206305,
+            "acc_norm": 0.5598290598290598,
+            "acc_norm_stderr": 0.0325207417206305
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.3886792452830189,
+            "acc_stderr": 0.030000485448675986,
+            "acc_norm": 0.3886792452830189,
+            "acc_norm_stderr": 0.030000485448675986
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.44545454545454544,
+            "acc_stderr": 0.047605488214603246,
+            "acc_norm": 0.44545454545454544,
+            "acc_norm_stderr": 0.047605488214603246
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.34444444444444444,
+            "acc_stderr": 0.028972648884844267,
+            "acc_norm": 0.34444444444444444,
+            "acc_norm_stderr": 0.028972648884844267
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.3443708609271523,
+            "acc_stderr": 0.038796870240733264,
+            "acc_norm": 0.3443708609271523,
+            "acc_norm_stderr": 0.038796870240733264
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.4577114427860697,
+            "acc_stderr": 0.035228658640995975,
+            "acc_norm": 0.4577114427860697,
+            "acc_norm_stderr": 0.035228658640995975
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.3815028901734104,
+            "acc_stderr": 0.03703851193099521,
+            "acc_norm": 0.3815028901734104,
+            "acc_norm_stderr": 0.03703851193099521
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.35714285714285715,
+            "acc_stderr": 0.02467786284133278,
+            "acc_norm": 0.35714285714285715,
+            "acc_norm_stderr": 0.02467786284133278
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.3333333333333333,
+            "acc_stderr": 0.03942082639927213,
+            "acc_norm": 0.3333333333333333,
+            "acc_norm_stderr": 0.03942082639927213
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.47,
+            "acc_stderr": 0.05016135580465919,
+            "acc_norm": 0.47,
+            "acc_norm_stderr": 0.05016135580465919
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.54,
+            "acc_stderr": 0.05009082659620333,
+            "acc_norm": 0.54,
+            "acc_norm_stderr": 0.05009082659620333
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.44508670520231214,
+            "acc_stderr": 0.02675625512966377,
+            "acc_norm": 0.44508670520231214,
+            "acc_norm_stderr": 0.02675625512966377
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.34355828220858897,
+            "acc_stderr": 0.03731133519673893,
+            "acc_norm": 0.34355828220858897,
+            "acc_norm_stderr": 0.03731133519673893
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.37037037037037035,
+            "acc_stderr": 0.02686949074481525,
+            "acc_norm": 0.37037037037037035,
+            "acc_norm_stderr": 0.02686949074481525
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.04725815626252605,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.04725815626252605
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.44559585492227977,
+            "acc_stderr": 0.0358701498607566,
+            "acc_norm": 0.44559585492227977,
+            "acc_norm_stderr": 0.0358701498607566
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.2719298245614035,
+            "acc_stderr": 0.041857744240220575,
+            "acc_norm": 0.2719298245614035,
+            "acc_norm_stderr": 0.041857744240220575
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.3798165137614679,
+            "acc_stderr": 0.020808825617866244,
+            "acc_norm": 0.3798165137614679,
+            "acc_norm_stderr": 0.020808825617866244
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.3492063492063492,
+            "acc_stderr": 0.04263906892795132,
+            "acc_norm": 0.3492063492063492,
+            "acc_norm_stderr": 0.04263906892795132
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.4117647058823529,
+            "acc_stderr": 0.02818059632825929,
+            "acc_norm": 0.4117647058823529,
+            "acc_norm_stderr": 0.02818059632825929
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.42,
+            "acc_stderr": 0.049604496374885836,
+            "acc_norm": 0.42,
+            "acc_norm_stderr": 0.049604496374885836
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.5619834710743802,
+            "acc_stderr": 0.045291468044357915,
+            "acc_norm": 0.5619834710743802,
+            "acc_norm_stderr": 0.045291468044357915
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.34868421052631576,
+            "acc_stderr": 0.038781398887976125,
+            "acc_norm": 0.34868421052631576,
+            "acc_norm_stderr": 0.038781398887976125
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.3284313725490196,
+            "acc_stderr": 0.018999707383162666,
+            "acc_norm": 0.3284313725490196,
+            "acc_norm_stderr": 0.018999707383162666
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.2730496453900709,
+            "acc_stderr": 0.026577860943307857,
+            "acc_norm": 0.2730496453900709,
+            "acc_norm_stderr": 0.026577860943307857
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.2767857142857143,
+            "acc_stderr": 0.04246624336697627,
+            "acc_norm": 0.2767857142857143,
+            "acc_norm_stderr": 0.04246624336697627
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.4074074074074074,
+            "acc_stderr": 0.03350991604696043,
+            "acc_norm": 0.4074074074074074,
+            "acc_norm_stderr": 0.03350991604696043
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.23910614525139665,
+            "acc_stderr": 0.014265554192331149,
+            "acc_norm": 0.23910614525139665,
+            "acc_norm_stderr": 0.014265554192331149
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.047258156262526045,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.047258156262526045
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.04923659639173309,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.04923659639173309
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.4227941176470588,
+            "acc_stderr": 0.030008562845003483,
+            "acc_norm": 0.4227941176470588,
+            "acc_norm_stderr": 0.030008562845003483
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.3469387755102041,
+            "acc_stderr": 0.030472526026726492,
+            "acc_norm": 0.3469387755102041,
+            "acc_norm_stderr": 0.030472526026726492
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.4177215189873418,
+            "acc_stderr": 0.032103530322412685,
+            "acc_norm": 0.4177215189873418,
+            "acc_norm_stderr": 0.032103530322412685
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.3005215123859192,
+            "acc_stderr": 0.011709918883039124,
+            "acc_norm": 0.3005215123859192,
+            "acc_norm_stderr": 0.011709918883039124
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.3872549019607843,
+            "acc_stderr": 0.03418931233833344,
+            "acc_norm": 0.3872549019607843,
+            "acc_norm_stderr": 0.03418931233833344
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.43636363636363634,
+            "acc_stderr": 0.03872592983524753,
+            "acc_norm": 0.43636363636363634,
+            "acc_norm_stderr": 0.03872592983524753
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.3072215422276622,
+            "mc1_stderr": 0.016150201321323002,
+            "mc2": 0.4721418472000992,
+            "mc2_stderr": 0.01626625866283201
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.27863046044864226,
+            "acc_stderr": 0.01541373949434568,
+            "acc_norm": 0.3825265643447462,
+            "acc_norm_stderr": 0.016709165387228803
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "BioMistral/BioMistral-7B",
+        "model_sha": "9a11e1ffa817c211cbb52ee1fb312dc6b61b40a5",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json b/eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac0ac041b644148e7ea7a60a1059234fe3564f1d
--- /dev/null
+++ b/eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json	
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 11
+        },
+        "quarterly": {
+          "quarterly": 11
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.2235494880546075,
+            "acc_stderr": 0.012174896631202605,
+            "acc_norm": 0.2815699658703072,
+            "acc_norm_stderr": 0.013143376735009015
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.3345947022505477,
+            "acc_stderr": 0.004708842600177431,
+            "acc_norm": 0.4135630352519418,
+            "acc_norm_stderr": 0.0049146550633294974
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.27485380116959063,
+            "acc_stderr": 0.03424042924691585,
+            "acc_norm": 0.27485380116959063,
+            "acc_norm_stderr": 0.03424042924691585
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.27184466019417475,
+            "acc_stderr": 0.044052680241409216,
+            "acc_norm": 0.27184466019417475,
+            "acc_norm_stderr": 0.044052680241409216
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.26947637292464877,
+            "acc_stderr": 0.015866243073215065,
+            "acc_norm": 0.26947637292464877,
+            "acc_norm_stderr": 0.015866243073215065
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.26666666666666666,
+            "acc_stderr": 0.038201699145179055,
+            "acc_norm": 0.26666666666666666,
+            "acc_norm_stderr": 0.038201699145179055
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.3,
+            "acc_stderr": 0.046056618647183814,
+            "acc_norm": 0.3,
+            "acc_norm_stderr": 0.046056618647183814
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.2127659574468085,
+            "acc_stderr": 0.026754391348039783,
+            "acc_norm": 0.2127659574468085,
+            "acc_norm_stderr": 0.026754391348039783
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.24096385542168675,
+            "acc_stderr": 0.033293941190735296,
+            "acc_norm": 0.24096385542168675,
+            "acc_norm_stderr": 0.033293941190735296
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.2379421221864952,
+            "acc_stderr": 0.024185150647818707,
+            "acc_norm": 0.2379421221864952,
+            "acc_norm_stderr": 0.024185150647818707
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.2825112107623318,
+            "acc_stderr": 0.030216831011508766,
+            "acc_norm": 0.2825112107623318,
+            "acc_norm_stderr": 0.030216831011508766
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.21374045801526717,
+            "acc_stderr": 0.0359546161177469,
+            "acc_norm": 0.21374045801526717,
+            "acc_norm_stderr": 0.0359546161177469
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.24,
+            "acc_stderr": 0.042923469599092816,
+            "acc_norm": 0.24,
+            "acc_norm_stderr": 0.042923469599092816
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.2474747474747475,
+            "acc_stderr": 0.03074630074212451,
+            "acc_norm": 0.2474747474747475,
+            "acc_norm_stderr": 0.03074630074212451
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.22758620689655173,
+            "acc_stderr": 0.03493950380131184,
+            "acc_norm": 0.22758620689655173,
+            "acc_norm_stderr": 0.03493950380131184
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.22549019607843138,
+            "acc_stderr": 0.041583075330832865,
+            "acc_norm": 0.22549019607843138,
+            "acc_norm_stderr": 0.041583075330832865
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.31512605042016806,
+            "acc_stderr": 0.030176808288974337,
+            "acc_norm": 0.31512605042016806,
+            "acc_norm_stderr": 0.030176808288974337
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.2205128205128205,
+            "acc_stderr": 0.02102067268082791,
+            "acc_norm": 0.2205128205128205,
+            "acc_norm_stderr": 0.02102067268082791
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.18,
+            "acc_stderr": 0.038612291966536955,
+            "acc_norm": 0.18,
+            "acc_norm_stderr": 0.038612291966536955
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.31,
+            "acc_stderr": 0.04648231987117316,
+            "acc_norm": 0.31,
+            "acc_norm_stderr": 0.04648231987117316
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.25,
+            "acc_stderr": 0.04186091791394607,
+            "acc_norm": 0.25,
+            "acc_norm_stderr": 0.04186091791394607
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.2660098522167488,
+            "acc_stderr": 0.03108982600293752,
+            "acc_norm": 0.2660098522167488,
+            "acc_norm_stderr": 0.03108982600293752
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.3,
+            "acc_stderr": 0.02606936229533513,
+            "acc_norm": 0.3,
+            "acc_norm_stderr": 0.02606936229533513
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.23076923076923078,
+            "acc_stderr": 0.027601921381417607,
+            "acc_norm": 0.23076923076923078,
+            "acc_norm_stderr": 0.027601921381417607
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.25660377358490566,
+            "acc_stderr": 0.026880647889051968,
+            "acc_norm": 0.25660377358490566,
+            "acc_norm_stderr": 0.026880647889051968
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.2545454545454545,
+            "acc_stderr": 0.04172343038705383,
+            "acc_norm": 0.2545454545454545,
+            "acc_norm_stderr": 0.04172343038705383
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.2962962962962963,
+            "acc_stderr": 0.02784081149587194,
+            "acc_norm": 0.2962962962962963,
+            "acc_norm_stderr": 0.02784081149587194
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.304635761589404,
+            "acc_stderr": 0.03757949922943342,
+            "acc_norm": 0.304635761589404,
+            "acc_norm_stderr": 0.03757949922943342
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.25870646766169153,
+            "acc_stderr": 0.03096590312357303,
+            "acc_norm": 0.25870646766169153,
+            "acc_norm_stderr": 0.03096590312357303
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.2254335260115607,
+            "acc_stderr": 0.03186209851641144,
+            "acc_norm": 0.2254335260115607,
+            "acc_norm_stderr": 0.03186209851641144
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.2566137566137566,
+            "acc_stderr": 0.022494510767503154,
+            "acc_norm": 0.2566137566137566,
+            "acc_norm_stderr": 0.022494510767503154
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.2638888888888889,
+            "acc_stderr": 0.03685651095897532,
+            "acc_norm": 0.2638888888888889,
+            "acc_norm_stderr": 0.03685651095897532
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.23,
+            "acc_stderr": 0.04229525846816505,
+            "acc_norm": 0.23,
+            "acc_norm_stderr": 0.04229525846816505
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.22,
+            "acc_stderr": 0.04163331998932269,
+            "acc_norm": 0.22,
+            "acc_norm_stderr": 0.04163331998932269
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.24855491329479767,
+            "acc_stderr": 0.023267528432100174,
+            "acc_norm": 0.24855491329479767,
+            "acc_norm_stderr": 0.023267528432100174
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.31901840490797545,
+            "acc_stderr": 0.03661997551073836,
+            "acc_norm": 0.31901840490797545,
+            "acc_norm_stderr": 0.03661997551073836
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.2623456790123457,
+            "acc_stderr": 0.024477222856135114,
+            "acc_norm": 0.2623456790123457,
+            "acc_norm_stderr": 0.024477222856135114
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.25,
+            "acc_stderr": 0.04351941398892446,
+            "acc_norm": 0.25,
+            "acc_norm_stderr": 0.04351941398892446
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.33678756476683935,
+            "acc_stderr": 0.03410780251836184,
+            "acc_norm": 0.33678756476683935,
+            "acc_norm_stderr": 0.03410780251836184
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.20175438596491227,
+            "acc_stderr": 0.037752050135836386,
+            "acc_norm": 0.20175438596491227,
+            "acc_norm_stderr": 0.037752050135836386
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.24220183486238533,
+            "acc_stderr": 0.01836817630659862,
+            "acc_norm": 0.24220183486238533,
+            "acc_norm_stderr": 0.01836817630659862
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.23015873015873015,
+            "acc_stderr": 0.03764950879790606,
+            "acc_norm": 0.23015873015873015,
+            "acc_norm_stderr": 0.03764950879790606
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.23529411764705882,
+            "acc_stderr": 0.024288619466046102,
+            "acc_norm": 0.23529411764705882,
+            "acc_norm_stderr": 0.024288619466046102
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.18,
+            "acc_stderr": 0.03861229196653695,
+            "acc_norm": 0.18,
+            "acc_norm_stderr": 0.03861229196653695
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.256198347107438,
+            "acc_stderr": 0.039849796533028704,
+            "acc_norm": 0.256198347107438,
+            "acc_norm_stderr": 0.039849796533028704
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.21710526315789475,
+            "acc_stderr": 0.033550453048829226,
+            "acc_norm": 0.21710526315789475,
+            "acc_norm_stderr": 0.033550453048829226
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.24019607843137256,
+            "acc_stderr": 0.01728276069516743,
+            "acc_norm": 0.24019607843137256,
+            "acc_norm_stderr": 0.01728276069516743
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.2553191489361702,
+            "acc_stderr": 0.02601199293090201,
+            "acc_norm": 0.2553191489361702,
+            "acc_norm_stderr": 0.02601199293090201
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.21428571428571427,
+            "acc_stderr": 0.03894641120044793,
+            "acc_norm": 0.21428571428571427,
+            "acc_norm_stderr": 0.03894641120044793
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.46296296296296297,
+            "acc_stderr": 0.03400603625538272,
+            "acc_norm": 0.46296296296296297,
+            "acc_norm_stderr": 0.03400603625538272
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.24692737430167597,
+            "acc_stderr": 0.014422292204808852,
+            "acc_norm": 0.24692737430167597,
+            "acc_norm_stderr": 0.014422292204808852
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.25,
+            "acc_stderr": 0.04351941398892446,
+            "acc_norm": 0.25,
+            "acc_norm_stderr": 0.04351941398892446
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.3,
+            "acc_stderr": 0.046056618647183814,
+            "acc_norm": 0.3,
+            "acc_norm_stderr": 0.046056618647183814
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.4411764705882353,
+            "acc_stderr": 0.030161911930767102,
+            "acc_norm": 0.4411764705882353,
+            "acc_norm_stderr": 0.030161911930767102
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.3795918367346939,
+            "acc_stderr": 0.03106721126287249,
+            "acc_norm": 0.3795918367346939,
+            "acc_norm_stderr": 0.03106721126287249
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.2109704641350211,
+            "acc_stderr": 0.02655837250266192,
+            "acc_norm": 0.2109704641350211,
+            "acc_norm_stderr": 0.02655837250266192
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.23468057366362452,
+            "acc_stderr": 0.010824026872449344,
+            "acc_norm": 0.23468057366362452,
+            "acc_norm_stderr": 0.010824026872449344
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.25,
+            "acc_stderr": 0.03039153369274154,
+            "acc_norm": 0.25,
+            "acc_norm_stderr": 0.03039153369274154
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.22424242424242424,
+            "acc_stderr": 0.03256866661681102,
+            "acc_norm": 0.22424242424242424,
+            "acc_norm_stderr": 0.03256866661681102
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.25091799265605874,
+            "mc1_stderr": 0.015176985027707682,
+            "mc2": 0.4116568832959107,
+            "mc2_stderr": 0.015044504977529799
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.27744982290436837,
+            "acc_stderr": 0.015393630236605975,
+            "acc_norm": 0.3400236127508855,
+            "acc_norm_stderr": 0.016286717220737674
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "EleutherAI/polyglot-ko-1.3b",
+        "model_sha": "557e162cf6e944fdbae05bab2e45d066a125eacb",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/HuggingFaceH4/.DS_Store b/eval-results/HuggingFaceH4/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..242908a5e43d09a037aba8dd0f0ec7a5e2c351cd
Binary files /dev/null and b/eval-results/HuggingFaceH4/.DS_Store differ
diff --git a/eval-results/HuggingFaceH4/zephyr-7b-beta/result.json b/eval-results/HuggingFaceH4/zephyr-7b-beta/result.json
new file mode 100644
index 0000000000000000000000000000000000000000..03ed7830c4c7cad4daa545fb305d005506cc24c3
--- /dev/null
+++ b/eval-results/HuggingFaceH4/zephyr-7b-beta/result.json
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 8
+        },
+        "quarterly": {
+          "quarterly": 8
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.33532423208191126,
+            "acc_stderr": 0.01379618294778556,
+            "acc_norm": 0.3848122866894198,
+            "acc_norm_stderr": 0.014218371065251112
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.35480979884485164,
+            "acc_stderr": 0.004774778180345192,
+            "acc_norm": 0.44911372236606256,
+            "acc_norm_stderr": 0.00496387293685794
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.45614035087719296,
+            "acc_stderr": 0.03820042586602966,
+            "acc_norm": 0.45614035087719296,
+            "acc_norm_stderr": 0.03820042586602966
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.6019417475728155,
+            "acc_stderr": 0.04846748253977238,
+            "acc_norm": 0.6019417475728155,
+            "acc_norm_stderr": 0.04846748253977238
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.41762452107279696,
+            "acc_stderr": 0.017635637326951534,
+            "acc_norm": 0.41762452107279696,
+            "acc_norm_stderr": 0.017635637326951534
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.34074074074074073,
+            "acc_stderr": 0.040943762699967946,
+            "acc_norm": 0.34074074074074073,
+            "acc_norm_stderr": 0.040943762699967946
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.19,
+            "acc_stderr": 0.03942772444036623,
+            "acc_norm": 0.19,
+            "acc_norm_stderr": 0.03942772444036623
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.2978723404255319,
+            "acc_stderr": 0.029896145682095462,
+            "acc_norm": 0.2978723404255319,
+            "acc_norm_stderr": 0.029896145682095462
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.3614457831325301,
+            "acc_stderr": 0.0374005938202932,
+            "acc_norm": 0.3614457831325301,
+            "acc_norm_stderr": 0.0374005938202932
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.4758842443729904,
+            "acc_stderr": 0.028365041542564584,
+            "acc_norm": 0.4758842443729904,
+            "acc_norm_stderr": 0.028365041542564584
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.3811659192825112,
+            "acc_stderr": 0.032596251184168284,
+            "acc_norm": 0.3811659192825112,
+            "acc_norm_stderr": 0.032596251184168284
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.3511450381679389,
+            "acc_stderr": 0.04186445163013751,
+            "acc_norm": 0.3511450381679389,
+            "acc_norm_stderr": 0.04186445163013751
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.27,
+            "acc_stderr": 0.0446196043338474,
+            "acc_norm": 0.27,
+            "acc_norm_stderr": 0.0446196043338474
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.494949494949495,
+            "acc_stderr": 0.035621707606254015,
+            "acc_norm": 0.494949494949495,
+            "acc_norm_stderr": 0.035621707606254015
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.04082482904638628,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.04082482904638628
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.3137254901960784,
+            "acc_stderr": 0.04617034827006717,
+            "acc_norm": 0.3137254901960784,
+            "acc_norm_stderr": 0.04617034827006717
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.4957983193277311,
+            "acc_stderr": 0.0324773433444811,
+            "acc_norm": 0.4957983193277311,
+            "acc_norm_stderr": 0.0324773433444811
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.4256410256410256,
+            "acc_stderr": 0.025069094387296546,
+            "acc_norm": 0.4256410256410256,
+            "acc_norm_stderr": 0.025069094387296546
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.59,
+            "acc_stderr": 0.049431107042371025,
+            "acc_norm": 0.59,
+            "acc_norm_stderr": 0.049431107042371025
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.29,
+            "acc_stderr": 0.045604802157206845,
+            "acc_norm": 0.29,
+            "acc_norm_stderr": 0.045604802157206845
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.4537037037037037,
+            "acc_stderr": 0.04812917324536821,
+            "acc_norm": 0.4537037037037037,
+            "acc_norm_stderr": 0.04812917324536821
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.35467980295566504,
+            "acc_stderr": 0.03366124489051449,
+            "acc_norm": 0.35467980295566504,
+            "acc_norm_stderr": 0.03366124489051449
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.4290322580645161,
+            "acc_stderr": 0.02815603653823321,
+            "acc_norm": 0.4290322580645161,
+            "acc_norm_stderr": 0.02815603653823321
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.6666666666666666,
+            "acc_stderr": 0.03088273697413865,
+            "acc_norm": 0.6666666666666666,
+            "acc_norm_stderr": 0.03088273697413865
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.4188679245283019,
+            "acc_stderr": 0.03036505082911521,
+            "acc_norm": 0.4188679245283019,
+            "acc_norm_stderr": 0.03036505082911521
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.42727272727272725,
+            "acc_stderr": 0.04738198703545483,
+            "acc_norm": 0.42727272727272725,
+            "acc_norm_stderr": 0.04738198703545483
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.34814814814814815,
+            "acc_stderr": 0.029045600290616258,
+            "acc_norm": 0.34814814814814815,
+            "acc_norm_stderr": 0.029045600290616258
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.2913907284768212,
+            "acc_stderr": 0.037101857261199946,
+            "acc_norm": 0.2913907284768212,
+            "acc_norm_stderr": 0.037101857261199946
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.5174129353233831,
+            "acc_stderr": 0.03533389234739245,
+            "acc_norm": 0.5174129353233831,
+            "acc_norm_stderr": 0.03533389234739245
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.37572254335260113,
+            "acc_stderr": 0.03692820767264867,
+            "acc_norm": 0.37572254335260113,
+            "acc_norm_stderr": 0.03692820767264867
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.3492063492063492,
+            "acc_stderr": 0.024552292209342658,
+            "acc_norm": 0.3492063492063492,
+            "acc_norm_stderr": 0.024552292209342658
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.3333333333333333,
+            "acc_stderr": 0.039420826399272135,
+            "acc_norm": 0.3333333333333333,
+            "acc_norm_stderr": 0.039420826399272135
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.35,
+            "acc_stderr": 0.04793724854411019,
+            "acc_norm": 0.35,
+            "acc_norm_stderr": 0.04793724854411019
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.49,
+            "acc_stderr": 0.05024183937956913,
+            "acc_norm": 0.49,
+            "acc_norm_stderr": 0.05024183937956913
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.47398843930635837,
+            "acc_stderr": 0.026882643434022885,
+            "acc_norm": 0.47398843930635837,
+            "acc_norm_stderr": 0.026882643434022885
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.44171779141104295,
+            "acc_stderr": 0.039015918258361836,
+            "acc_norm": 0.44171779141104295,
+            "acc_norm_stderr": 0.039015918258361836
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.42592592592592593,
+            "acc_stderr": 0.027513747284379424,
+            "acc_norm": 0.42592592592592593,
+            "acc_norm_stderr": 0.027513747284379424
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.04725815626252606,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.04725815626252606
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.5129533678756477,
+            "acc_stderr": 0.0360722806104775,
+            "acc_norm": 0.5129533678756477,
+            "acc_norm_stderr": 0.0360722806104775
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.24561403508771928,
+            "acc_stderr": 0.0404933929774814,
+            "acc_norm": 0.24561403508771928,
+            "acc_norm_stderr": 0.0404933929774814
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.47155963302752296,
+            "acc_stderr": 0.02140261569734804,
+            "acc_norm": 0.47155963302752296,
+            "acc_norm_stderr": 0.02140261569734804
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.36507936507936506,
+            "acc_stderr": 0.04306241259127152,
+            "acc_norm": 0.36507936507936506,
+            "acc_norm_stderr": 0.04306241259127152
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.4117647058823529,
+            "acc_stderr": 0.028180596328259297,
+            "acc_norm": 0.4117647058823529,
+            "acc_norm_stderr": 0.028180596328259297
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.44,
+            "acc_stderr": 0.04988876515698589,
+            "acc_norm": 0.44,
+            "acc_norm_stderr": 0.04988876515698589
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.5867768595041323,
+            "acc_stderr": 0.04495087843548408,
+            "acc_norm": 0.5867768595041323,
+            "acc_norm_stderr": 0.04495087843548408
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.40131578947368424,
+            "acc_stderr": 0.03988903703336284,
+            "acc_norm": 0.40131578947368424,
+            "acc_norm_stderr": 0.03988903703336284
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.32679738562091504,
+            "acc_stderr": 0.018975427920507215,
+            "acc_norm": 0.32679738562091504,
+            "acc_norm_stderr": 0.018975427920507215
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.3333333333333333,
+            "acc_stderr": 0.02812163604063988,
+            "acc_norm": 0.3333333333333333,
+            "acc_norm_stderr": 0.02812163604063988
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.3392857142857143,
+            "acc_stderr": 0.04493949068613539,
+            "acc_norm": 0.3392857142857143,
+            "acc_norm_stderr": 0.04493949068613539
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.41203703703703703,
+            "acc_stderr": 0.03356787758160835,
+            "acc_norm": 0.41203703703703703,
+            "acc_norm_stderr": 0.03356787758160835
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.329608938547486,
+            "acc_stderr": 0.015721531075183884,
+            "acc_norm": 0.329608938547486,
+            "acc_norm_stderr": 0.015721531075183884
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.39,
+            "acc_stderr": 0.04902071300001975,
+            "acc_norm": 0.39,
+            "acc_norm_stderr": 0.04902071300001975
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.61,
+            "acc_stderr": 0.04902071300001975,
+            "acc_norm": 0.61,
+            "acc_norm_stderr": 0.04902071300001975
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.375,
+            "acc_stderr": 0.029408372932278746,
+            "acc_norm": 0.375,
+            "acc_norm_stderr": 0.029408372932278746
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.43673469387755104,
+            "acc_stderr": 0.03175195237583322,
+            "acc_norm": 0.43673469387755104,
+            "acc_norm_stderr": 0.03175195237583322
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.4810126582278481,
+            "acc_stderr": 0.03252375148090448,
+            "acc_norm": 0.4810126582278481,
+            "acc_norm_stderr": 0.03252375148090448
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.29791395045632335,
+            "acc_stderr": 0.011680717340400059,
+            "acc_norm": 0.29791395045632335,
+            "acc_norm_stderr": 0.011680717340400059
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.29411764705882354,
+            "acc_stderr": 0.03198001660115072,
+            "acc_norm": 0.29411764705882354,
+            "acc_norm_stderr": 0.03198001660115072
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.30303030303030304,
+            "acc_stderr": 0.03588624800091707,
+            "acc_norm": 0.30303030303030304,
+            "acc_norm_stderr": 0.03588624800091707
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.3317013463892289,
+            "mc1_stderr": 0.01648214881024147,
+            "mc2": 0.5171680571717291,
+            "mc2_stderr": 0.01606077987901482
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.39787485242030696,
+            "acc_stderr": 0.01682795905473339,
+            "acc_norm": 0.4014167650531287,
+            "acc_norm_stderr": 0.01685290785872906
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "HuggingFaceH4/zephyr-7b-beta",
+        "model_sha": "3bac358730f8806e5c3dc7c7e19eb36e045bf720",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/README.md b/eval-results/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7b95401dc46245ac339fc25059d4a56d90b4cde5
--- /dev/null
+++ b/eval-results/README.md
@@ -0,0 +1,3 @@
+---
+license: apache-2.0
+---
diff --git a/eval-results/nlpai-lab/KULLM3/result.json b/eval-results/nlpai-lab/KULLM3/result.json
new file mode 100644
index 0000000000000000000000000000000000000000..be8b17b0dd2fc13e9bab3e5d24fec5086956344b
--- /dev/null
+++ b/eval-results/nlpai-lab/KULLM3/result.json
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 6
+        },
+        "quarterly": {
+          "quarterly": 6
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.42918088737201365,
+            "acc_stderr": 0.014464085894870651,
+            "acc_norm": 0.46501706484641636,
+            "acc_norm_stderr": 0.014575583922019672
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.445628360884286,
+            "acc_stderr": 0.004960191341430244,
+            "acc_norm": 0.589523999203346,
+            "acc_norm_stderr": 0.004909148239488273
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.6432748538011696,
+            "acc_stderr": 0.03674013002860954,
+            "acc_norm": 0.6432748538011696,
+            "acc_norm_stderr": 0.03674013002860954
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.6116504854368932,
+            "acc_stderr": 0.04825729337356389,
+            "acc_norm": 0.6116504854368932,
+            "acc_norm_stderr": 0.04825729337356389
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.6155810983397191,
+            "acc_stderr": 0.01739568874281962,
+            "acc_norm": 0.6155810983397191,
+            "acc_norm_stderr": 0.01739568874281962
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.4962962962962963,
+            "acc_stderr": 0.04319223625811331,
+            "acc_norm": 0.4962962962962963,
+            "acc_norm_stderr": 0.04319223625811331
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.26,
+            "acc_stderr": 0.04408440022768077,
+            "acc_norm": 0.26,
+            "acc_norm_stderr": 0.04408440022768077
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.4553191489361702,
+            "acc_stderr": 0.03255525359340354,
+            "acc_norm": 0.4553191489361702,
+            "acc_norm_stderr": 0.03255525359340354
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.5180722891566265,
+            "acc_stderr": 0.038899512528272166,
+            "acc_norm": 0.5180722891566265,
+            "acc_norm_stderr": 0.038899512528272166
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.5755627009646302,
+            "acc_stderr": 0.028071928247946205,
+            "acc_norm": 0.5755627009646302,
+            "acc_norm_stderr": 0.028071928247946205
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.5650224215246636,
+            "acc_stderr": 0.033272833702713445,
+            "acc_norm": 0.5650224215246636,
+            "acc_norm_stderr": 0.033272833702713445
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.5877862595419847,
+            "acc_stderr": 0.04317171194870255,
+            "acc_norm": 0.5877862595419847,
+            "acc_norm_stderr": 0.04317171194870255
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.5,
+            "acc_stderr": 0.050251890762960605,
+            "acc_norm": 0.5,
+            "acc_norm_stderr": 0.050251890762960605
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.6515151515151515,
+            "acc_stderr": 0.033948539651564025,
+            "acc_norm": 0.6515151515151515,
+            "acc_norm_stderr": 0.033948539651564025
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.503448275862069,
+            "acc_stderr": 0.04166567577101579,
+            "acc_norm": 0.503448275862069,
+            "acc_norm_stderr": 0.04166567577101579
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.2549019607843137,
+            "acc_stderr": 0.043364327079931785,
+            "acc_norm": 0.2549019607843137,
+            "acc_norm_stderr": 0.043364327079931785
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.5756302521008403,
+            "acc_stderr": 0.03210479051015776,
+            "acc_norm": 0.5756302521008403,
+            "acc_norm_stderr": 0.03210479051015776
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.541025641025641,
+            "acc_stderr": 0.025265525491284295,
+            "acc_norm": 0.541025641025641,
+            "acc_norm_stderr": 0.025265525491284295
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.54,
+            "acc_stderr": 0.05009082659620332,
+            "acc_norm": 0.54,
+            "acc_norm_stderr": 0.05009082659620332
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.31,
+            "acc_stderr": 0.04648231987117316,
+            "acc_norm": 0.31,
+            "acc_norm_stderr": 0.04648231987117316
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.5555555555555556,
+            "acc_stderr": 0.04803752235190192,
+            "acc_norm": 0.5555555555555556,
+            "acc_norm_stderr": 0.04803752235190192
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.3842364532019704,
+            "acc_stderr": 0.0342239856565755,
+            "acc_norm": 0.3842364532019704,
+            "acc_norm_stderr": 0.0342239856565755
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.5774193548387097,
+            "acc_stderr": 0.02810096472427264,
+            "acc_norm": 0.5774193548387097,
+            "acc_norm_stderr": 0.02810096472427264
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.7777777777777778,
+            "acc_stderr": 0.027236013946196673,
+            "acc_norm": 0.7777777777777778,
+            "acc_norm_stderr": 0.027236013946196673
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.4981132075471698,
+            "acc_stderr": 0.030772653642075657,
+            "acc_norm": 0.4981132075471698,
+            "acc_norm_stderr": 0.030772653642075657
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.5272727272727272,
+            "acc_stderr": 0.04782001791380061,
+            "acc_norm": 0.5272727272727272,
+            "acc_norm_stderr": 0.04782001791380061
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.25555555555555554,
+            "acc_stderr": 0.026593939101844082,
+            "acc_norm": 0.25555555555555554,
+            "acc_norm_stderr": 0.026593939101844082
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.33774834437086093,
+            "acc_stderr": 0.038615575462551684,
+            "acc_norm": 0.33774834437086093,
+            "acc_norm_stderr": 0.038615575462551684
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.7064676616915423,
+            "acc_stderr": 0.032200241045342054,
+            "acc_norm": 0.7064676616915423,
+            "acc_norm_stderr": 0.032200241045342054
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.4797687861271676,
+            "acc_stderr": 0.03809342081273958,
+            "acc_norm": 0.4797687861271676,
+            "acc_norm_stderr": 0.03809342081273958
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.38095238095238093,
+            "acc_stderr": 0.025010749116137602,
+            "acc_norm": 0.38095238095238093,
+            "acc_norm_stderr": 0.025010749116137602
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.4236111111111111,
+            "acc_stderr": 0.041321250197233685,
+            "acc_norm": 0.4236111111111111,
+            "acc_norm_stderr": 0.041321250197233685
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.31,
+            "acc_stderr": 0.04648231987117316,
+            "acc_norm": 0.31,
+            "acc_norm_stderr": 0.04648231987117316
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.71,
+            "acc_stderr": 0.04560480215720683,
+            "acc_norm": 0.71,
+            "acc_norm_stderr": 0.04560480215720683
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.5751445086705202,
+            "acc_stderr": 0.026613350840261733,
+            "acc_norm": 0.5751445086705202,
+            "acc_norm_stderr": 0.026613350840261733
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.5030674846625767,
+            "acc_stderr": 0.03928297078179662,
+            "acc_norm": 0.5030674846625767,
+            "acc_norm_stderr": 0.03928297078179662
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.5370370370370371,
+            "acc_stderr": 0.027744313443376536,
+            "acc_norm": 0.5370370370370371,
+            "acc_norm_stderr": 0.027744313443376536
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.04725815626252606,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.04725815626252606
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.6217616580310881,
+            "acc_stderr": 0.034998072761933376,
+            "acc_norm": 0.6217616580310881,
+            "acc_norm_stderr": 0.034998072761933376
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.37719298245614036,
+            "acc_stderr": 0.04559522141958216,
+            "acc_norm": 0.37719298245614036,
+            "acc_norm_stderr": 0.04559522141958216
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.6385321100917432,
+            "acc_stderr": 0.02059808200993736,
+            "acc_norm": 0.6385321100917432,
+            "acc_norm_stderr": 0.02059808200993736
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.4126984126984127,
+            "acc_stderr": 0.04403438954768177,
+            "acc_norm": 0.4126984126984127,
+            "acc_norm_stderr": 0.04403438954768177
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.5261437908496732,
+            "acc_stderr": 0.028590752958852387,
+            "acc_norm": 0.5261437908496732,
+            "acc_norm_stderr": 0.028590752958852387
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.57,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.57,
+            "acc_norm_stderr": 0.049756985195624284
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.7520661157024794,
+            "acc_stderr": 0.03941897526516304,
+            "acc_norm": 0.7520661157024794,
+            "acc_norm_stderr": 0.03941897526516304
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.5789473684210527,
+            "acc_stderr": 0.040179012759817494,
+            "acc_norm": 0.5789473684210527,
+            "acc_norm_stderr": 0.040179012759817494
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.4738562091503268,
+            "acc_stderr": 0.020200164564804588,
+            "acc_norm": 0.4738562091503268,
+            "acc_norm_stderr": 0.020200164564804588
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.3404255319148936,
+            "acc_stderr": 0.02826765748265013,
+            "acc_norm": 0.3404255319148936,
+            "acc_norm_stderr": 0.02826765748265013
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.38392857142857145,
+            "acc_stderr": 0.046161430750285455,
+            "acc_norm": 0.38392857142857145,
+            "acc_norm_stderr": 0.046161430750285455
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.4675925925925926,
+            "acc_stderr": 0.03402801581358966,
+            "acc_norm": 0.4675925925925926,
+            "acc_norm_stderr": 0.03402801581358966
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.21675977653631284,
+            "acc_stderr": 0.013780598486443363,
+            "acc_norm": 0.21675977653631284,
+            "acc_norm_stderr": 0.013780598486443363
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.39,
+            "acc_stderr": 0.04902071300001975,
+            "acc_norm": 0.39,
+            "acc_norm_stderr": 0.04902071300001975
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.71,
+            "acc_stderr": 0.04560480215720684,
+            "acc_norm": 0.71,
+            "acc_norm_stderr": 0.04560480215720684
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.4411764705882353,
+            "acc_stderr": 0.0301619119307671,
+            "acc_norm": 0.4411764705882353,
+            "acc_norm_stderr": 0.0301619119307671
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6285714285714286,
+            "acc_stderr": 0.03093285879278986,
+            "acc_norm": 0.6285714285714286,
+            "acc_norm_stderr": 0.03093285879278986
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.70042194092827,
+            "acc_stderr": 0.029818024749753095,
+            "acc_norm": 0.70042194092827,
+            "acc_norm_stderr": 0.029818024749753095
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.378748370273794,
+            "acc_stderr": 0.012389052105003741,
+            "acc_norm": 0.378748370273794,
+            "acc_norm_stderr": 0.012389052105003741
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.6225490196078431,
+            "acc_stderr": 0.03402272044340703,
+            "acc_norm": 0.6225490196078431,
+            "acc_norm_stderr": 0.03402272044340703
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.6666666666666666,
+            "acc_stderr": 0.03681050869161549,
+            "acc_norm": 0.6666666666666666,
+            "acc_norm_stderr": 0.03681050869161549
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.33659730722154224,
+            "mc1_stderr": 0.016542412809494877,
+            "mc2": 0.49995145184296846,
+            "mc2_stderr": 0.015887726098900913
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.564344746162928,
+            "acc_stderr": 0.017047415229476316,
+            "acc_norm": 0.6068476977567887,
+            "acc_norm_stderr": 0.016793262801287068
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "nlpai-lab/KULLM3",
+        "model_sha": "5a6bcd0fc7f240460eb6d57016f7b4060bc1f43b",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json b/eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json
new file mode 100644
index 0000000000000000000000000000000000000000..48ca4423df5ee9731f60f12785501c7159e68cf6
--- /dev/null
+++ b/eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 4
+        },
+        "quarterly": {
+          "quarterly": 4
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.7465870307167235,
+            "acc_stderr": 0.012710896778378604,
+            "acc_norm": 0.7807167235494881,
+            "acc_norm_stderr": 0.012091245787615728
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.6385182234614618,
+            "acc_stderr": 0.004794478426382617,
+            "acc_norm": 0.7561242780322645,
+            "acc_norm_stderr": 0.004285410130466119
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.6900584795321637,
+            "acc_stderr": 0.035469769593931624,
+            "acc_norm": 0.6900584795321637,
+            "acc_norm_stderr": 0.035469769593931624
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.6601941747572816,
+            "acc_stderr": 0.046897659372781335,
+            "acc_norm": 0.6601941747572816,
+            "acc_norm_stderr": 0.046897659372781335
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.6845466155810983,
+            "acc_stderr": 0.016617501738763408,
+            "acc_norm": 0.6845466155810983,
+            "acc_norm_stderr": 0.016617501738763408
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.48148148148148145,
+            "acc_stderr": 0.04316378599511324,
+            "acc_norm": 0.48148148148148145,
+            "acc_norm_stderr": 0.04316378599511324
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.047258156262526045,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.047258156262526045
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.46808510638297873,
+            "acc_stderr": 0.03261936918467383,
+            "acc_norm": 0.46808510638297873,
+            "acc_norm_stderr": 0.03261936918467383
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.4759036144578313,
+            "acc_stderr": 0.03887971849597264,
+            "acc_norm": 0.4759036144578313,
+            "acc_norm_stderr": 0.03887971849597264
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.6334405144694534,
+            "acc_stderr": 0.02736807824397163,
+            "acc_norm": 0.6334405144694534,
+            "acc_norm_stderr": 0.02736807824397163
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.6681614349775785,
+            "acc_stderr": 0.03160295143776679,
+            "acc_norm": 0.6681614349775785,
+            "acc_norm_stderr": 0.03160295143776679
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.6030534351145038,
+            "acc_stderr": 0.04291135671009224,
+            "acc_norm": 0.6030534351145038,
+            "acc_norm_stderr": 0.04291135671009224
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.51,
+            "acc_stderr": 0.05024183937956911,
+            "acc_norm": 0.51,
+            "acc_norm_stderr": 0.05024183937956911
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.7222222222222222,
+            "acc_stderr": 0.03191178226713547,
+            "acc_norm": 0.7222222222222222,
+            "acc_norm_stderr": 0.03191178226713547
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.47586206896551725,
+            "acc_stderr": 0.041618085035015295,
+            "acc_norm": 0.47586206896551725,
+            "acc_norm_stderr": 0.041618085035015295
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.2549019607843137,
+            "acc_stderr": 0.04336432707993178,
+            "acc_norm": 0.2549019607843137,
+            "acc_norm_stderr": 0.04336432707993178
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.592436974789916,
+            "acc_stderr": 0.031918633744784666,
+            "acc_norm": 0.592436974789916,
+            "acc_norm_stderr": 0.031918633744784666
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.5948717948717949,
+            "acc_stderr": 0.024890471769938142,
+            "acc_norm": 0.5948717948717949,
+            "acc_norm_stderr": 0.024890471769938142
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695237,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695237
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.27,
+            "acc_stderr": 0.04461960433384739,
+            "acc_norm": 0.27,
+            "acc_norm_stderr": 0.04461960433384739
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.6388888888888888,
+            "acc_stderr": 0.04643454608906275,
+            "acc_norm": 0.6388888888888888,
+            "acc_norm_stderr": 0.04643454608906275
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.4433497536945813,
+            "acc_stderr": 0.034953345821629345,
+            "acc_norm": 0.4433497536945813,
+            "acc_norm_stderr": 0.034953345821629345
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.5806451612903226,
+            "acc_stderr": 0.028071588901091838,
+            "acc_norm": 0.5806451612903226,
+            "acc_norm_stderr": 0.028071588901091838
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.811965811965812,
+            "acc_stderr": 0.025598193686652254,
+            "acc_norm": 0.811965811965812,
+            "acc_norm_stderr": 0.025598193686652254
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.5169811320754717,
+            "acc_stderr": 0.030755120364119898,
+            "acc_norm": 0.5169811320754717,
+            "acc_norm_stderr": 0.030755120364119898
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.5818181818181818,
+            "acc_stderr": 0.04724577405731573,
+            "acc_norm": 0.5818181818181818,
+            "acc_norm_stderr": 0.04724577405731573
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.3888888888888889,
+            "acc_stderr": 0.029723278961476664,
+            "acc_norm": 0.3888888888888889,
+            "acc_norm_stderr": 0.029723278961476664
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.3708609271523179,
+            "acc_stderr": 0.03943966699183629,
+            "acc_norm": 0.3708609271523179,
+            "acc_norm_stderr": 0.03943966699183629
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.6666666666666666,
+            "acc_stderr": 0.033333333333333326,
+            "acc_norm": 0.6666666666666666,
+            "acc_norm_stderr": 0.033333333333333326
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.47398843930635837,
+            "acc_stderr": 0.038073017265045125,
+            "acc_norm": 0.47398843930635837,
+            "acc_norm_stderr": 0.038073017265045125
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.42328042328042326,
+            "acc_stderr": 0.025446365634406793,
+            "acc_norm": 0.42328042328042326,
+            "acc_norm_stderr": 0.025446365634406793
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.5625,
+            "acc_stderr": 0.04148415739394154,
+            "acc_norm": 0.5625,
+            "acc_norm_stderr": 0.04148415739394154
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.39,
+            "acc_stderr": 0.04902071300001975,
+            "acc_norm": 0.39,
+            "acc_norm_stderr": 0.04902071300001975
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.78,
+            "acc_stderr": 0.04163331998932263,
+            "acc_norm": 0.78,
+            "acc_norm_stderr": 0.04163331998932263
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.5491329479768786,
+            "acc_stderr": 0.026788811931562767,
+            "acc_norm": 0.5491329479768786,
+            "acc_norm_stderr": 0.026788811931562767
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.6319018404907976,
+            "acc_stderr": 0.03789213935838396,
+            "acc_norm": 0.6319018404907976,
+            "acc_norm_stderr": 0.03789213935838396
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.5925925925925926,
+            "acc_stderr": 0.02733954664066273,
+            "acc_norm": 0.5925925925925926,
+            "acc_norm_stderr": 0.02733954664066273
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.049236596391733084,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.049236596391733084
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.7668393782383419,
+            "acc_stderr": 0.03051611137147601,
+            "acc_norm": 0.7668393782383419,
+            "acc_norm_stderr": 0.03051611137147601
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.4473684210526316,
+            "acc_stderr": 0.046774730044912,
+            "acc_norm": 0.4473684210526316,
+            "acc_norm_stderr": 0.046774730044912
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.726605504587156,
+            "acc_stderr": 0.01910929984609827,
+            "acc_norm": 0.726605504587156,
+            "acc_norm_stderr": 0.01910929984609827
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.3968253968253968,
+            "acc_stderr": 0.04375888492727061,
+            "acc_norm": 0.3968253968253968,
+            "acc_norm_stderr": 0.04375888492727061
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.6078431372549019,
+            "acc_stderr": 0.027956046165424516,
+            "acc_norm": 0.6078431372549019,
+            "acc_norm_stderr": 0.027956046165424516
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.55,
+            "acc_stderr": 0.05,
+            "acc_norm": 0.55,
+            "acc_norm_stderr": 0.05
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.6942148760330579,
+            "acc_stderr": 0.04205953933884122,
+            "acc_norm": 0.6942148760330579,
+            "acc_norm_stderr": 0.04205953933884122
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.618421052631579,
+            "acc_stderr": 0.03953173377749194,
+            "acc_norm": 0.618421052631579,
+            "acc_norm_stderr": 0.03953173377749194
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.5669934640522876,
+            "acc_stderr": 0.02004544247332422,
+            "acc_norm": 0.5669934640522876,
+            "acc_norm_stderr": 0.02004544247332422
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.4219858156028369,
+            "acc_stderr": 0.029462189233370586,
+            "acc_norm": 0.4219858156028369,
+            "acc_norm_stderr": 0.029462189233370586
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.5089285714285714,
+            "acc_stderr": 0.04745033255489123,
+            "acc_norm": 0.5089285714285714,
+            "acc_norm_stderr": 0.04745033255489123
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.4351851851851852,
+            "acc_stderr": 0.03381200005643526,
+            "acc_norm": 0.4351851851851852,
+            "acc_norm_stderr": 0.03381200005643526
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.3787709497206704,
+            "acc_stderr": 0.016223533510365117,
+            "acc_norm": 0.3787709497206704,
+            "acc_norm_stderr": 0.016223533510365117
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.47,
+            "acc_stderr": 0.05016135580465919,
+            "acc_norm": 0.47,
+            "acc_norm_stderr": 0.05016135580465919
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695238,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695238
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.48161764705882354,
+            "acc_stderr": 0.03035230339535196,
+            "acc_norm": 0.48161764705882354,
+            "acc_norm_stderr": 0.03035230339535196
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6448979591836734,
+            "acc_stderr": 0.030635655150387634,
+            "acc_norm": 0.6448979591836734,
+            "acc_norm_stderr": 0.030635655150387634
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.729957805907173,
+            "acc_stderr": 0.028900721906293426,
+            "acc_norm": 0.729957805907173,
+            "acc_norm_stderr": 0.028900721906293426
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.41460234680573665,
+            "acc_stderr": 0.012582597058908284,
+            "acc_norm": 0.41460234680573665,
+            "acc_norm_stderr": 0.012582597058908284
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.6421568627450981,
+            "acc_stderr": 0.03364487286088298,
+            "acc_norm": 0.6421568627450981,
+            "acc_norm_stderr": 0.03364487286088298
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.6181818181818182,
+            "acc_stderr": 0.03793713171165635,
+            "acc_norm": 0.6181818181818182,
+            "acc_norm_stderr": 0.03793713171165635
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.6328029375764994,
+            "mc1_stderr": 0.01687480500145318,
+            "mc2": 0.7522925779273922,
+            "mc2_stderr": 0.014568927682929578
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.45218417945690675,
+            "acc_stderr": 0.017111567130916785,
+            "acc_norm": 0.45454545454545453,
+            "acc_norm_stderr": 0.017119172208061504
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-DPO-v1.3",
+        "model_sha": "337edbed4c86db2da27e3b0e07086134f8d27a09",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json b/eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json
new file mode 100644
index 0000000000000000000000000000000000000000..578117675d4a1e9aaa0b0ef30e18ebbfc8fd720d
--- /dev/null
+++ b/eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 7
+        },
+        "quarterly": {
+          "quarterly": 7
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.7363481228668942,
+            "acc_stderr": 0.012875929151297058,
+            "acc_norm": 0.7491467576791809,
+            "acc_norm_stderr": 0.012668198621315433
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.7228639713204541,
+            "acc_stderr": 0.004466695023677848,
+            "acc_norm": 0.7422824138617805,
+            "acc_norm_stderr": 0.004364838000335614
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.6140350877192983,
+            "acc_stderr": 0.03733756969066164,
+            "acc_norm": 0.6140350877192983,
+            "acc_norm_stderr": 0.03733756969066164
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.6893203883495146,
+            "acc_stderr": 0.045821241601615506,
+            "acc_norm": 0.6893203883495146,
+            "acc_norm_stderr": 0.045821241601615506
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.6526181353767561,
+            "acc_stderr": 0.017026671748655728,
+            "acc_norm": 0.6526181353767561,
+            "acc_norm_stderr": 0.017026671748655728
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.5037037037037037,
+            "acc_stderr": 0.043192236258113324,
+            "acc_norm": 0.5037037037037037,
+            "acc_norm_stderr": 0.043192236258113324
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.37,
+            "acc_stderr": 0.048523658709391,
+            "acc_norm": 0.37,
+            "acc_norm_stderr": 0.048523658709391
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.451063829787234,
+            "acc_stderr": 0.032529096196131965,
+            "acc_norm": 0.451063829787234,
+            "acc_norm_stderr": 0.032529096196131965
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.4939759036144578,
+            "acc_stderr": 0.03892212195333045,
+            "acc_norm": 0.4939759036144578,
+            "acc_norm_stderr": 0.03892212195333045
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.5852090032154341,
+            "acc_stderr": 0.02798268045975956,
+            "acc_norm": 0.5852090032154341,
+            "acc_norm_stderr": 0.02798268045975956
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.6412556053811659,
+            "acc_stderr": 0.032190792004199956,
+            "acc_norm": 0.6412556053811659,
+            "acc_norm_stderr": 0.032190792004199956
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.5954198473282443,
+            "acc_stderr": 0.043046937953806645,
+            "acc_norm": 0.5954198473282443,
+            "acc_norm_stderr": 0.043046937953806645
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.47,
+            "acc_stderr": 0.05016135580465919,
+            "acc_norm": 0.47,
+            "acc_norm_stderr": 0.05016135580465919
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.6616161616161617,
+            "acc_stderr": 0.033711241426263014,
+            "acc_norm": 0.6616161616161617,
+            "acc_norm_stderr": 0.033711241426263014
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.4827586206896552,
+            "acc_stderr": 0.041641887201693775,
+            "acc_norm": 0.4827586206896552,
+            "acc_norm_stderr": 0.041641887201693775
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.2549019607843137,
+            "acc_stderr": 0.04336432707993178,
+            "acc_norm": 0.2549019607843137,
+            "acc_norm_stderr": 0.04336432707993178
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.5882352941176471,
+            "acc_stderr": 0.031968769891957786,
+            "acc_norm": 0.5882352941176471,
+            "acc_norm_stderr": 0.031968769891957786
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.6025641025641025,
+            "acc_stderr": 0.024811920017903836,
+            "acc_norm": 0.6025641025641025,
+            "acc_norm_stderr": 0.024811920017903836
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695237,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695237
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.35,
+            "acc_stderr": 0.047937248544110196,
+            "acc_norm": 0.35,
+            "acc_norm_stderr": 0.047937248544110196
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.5925925925925926,
+            "acc_stderr": 0.04750077341199984,
+            "acc_norm": 0.5925925925925926,
+            "acc_norm_stderr": 0.04750077341199984
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.43842364532019706,
+            "acc_stderr": 0.03491207857486518,
+            "acc_norm": 0.43842364532019706,
+            "acc_norm_stderr": 0.03491207857486518
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.567741935483871,
+            "acc_stderr": 0.028181739720019413,
+            "acc_norm": 0.567741935483871,
+            "acc_norm_stderr": 0.028181739720019413
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.7948717948717948,
+            "acc_stderr": 0.026453508054040356,
+            "acc_norm": 0.7948717948717948,
+            "acc_norm_stderr": 0.026453508054040356
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.5169811320754717,
+            "acc_stderr": 0.030755120364119905,
+            "acc_norm": 0.5169811320754717,
+            "acc_norm_stderr": 0.030755120364119905
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.5727272727272728,
+            "acc_stderr": 0.047381987035454834,
+            "acc_norm": 0.5727272727272728,
+            "acc_norm_stderr": 0.047381987035454834
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.3962962962962963,
+            "acc_stderr": 0.029822619458533997,
+            "acc_norm": 0.3962962962962963,
+            "acc_norm_stderr": 0.029822619458533997
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.3708609271523179,
+            "acc_stderr": 0.03943966699183629,
+            "acc_norm": 0.3708609271523179,
+            "acc_norm_stderr": 0.03943966699183629
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.6766169154228856,
+            "acc_stderr": 0.03307615947979035,
+            "acc_norm": 0.6766169154228856,
+            "acc_norm_stderr": 0.03307615947979035
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.49710982658959535,
+            "acc_stderr": 0.038124005659748335,
+            "acc_norm": 0.49710982658959535,
+            "acc_norm_stderr": 0.038124005659748335
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.42592592592592593,
+            "acc_stderr": 0.02546714904546955,
+            "acc_norm": 0.42592592592592593,
+            "acc_norm_stderr": 0.02546714904546955
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.5555555555555556,
+            "acc_stderr": 0.04155319955593146,
+            "acc_norm": 0.5555555555555556,
+            "acc_norm_stderr": 0.04155319955593146
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.04923659639173309,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.04923659639173309
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.73,
+            "acc_stderr": 0.044619604333847394,
+            "acc_norm": 0.73,
+            "acc_norm_stderr": 0.044619604333847394
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.5549132947976878,
+            "acc_stderr": 0.02675625512966377,
+            "acc_norm": 0.5549132947976878,
+            "acc_norm_stderr": 0.02675625512966377
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.588957055214724,
+            "acc_stderr": 0.038656978537853624,
+            "acc_norm": 0.588957055214724,
+            "acc_norm_stderr": 0.038656978537853624
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.5771604938271605,
+            "acc_stderr": 0.027487472980871595,
+            "acc_norm": 0.5771604938271605,
+            "acc_norm_stderr": 0.027487472980871595
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.049236596391733084,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.049236596391733084
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.7305699481865285,
+            "acc_stderr": 0.032018671228777947,
+            "acc_norm": 0.7305699481865285,
+            "acc_norm_stderr": 0.032018671228777947
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.42105263157894735,
+            "acc_stderr": 0.046446020912223177,
+            "acc_norm": 0.42105263157894735,
+            "acc_norm_stderr": 0.046446020912223177
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.7064220183486238,
+            "acc_stderr": 0.019525151122639663,
+            "acc_norm": 0.7064220183486238,
+            "acc_norm_stderr": 0.019525151122639663
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.3968253968253968,
+            "acc_stderr": 0.04375888492727061,
+            "acc_norm": 0.3968253968253968,
+            "acc_norm_stderr": 0.04375888492727061
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.545751633986928,
+            "acc_stderr": 0.02850980780262659,
+            "acc_norm": 0.545751633986928,
+            "acc_norm_stderr": 0.02850980780262659
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.55,
+            "acc_stderr": 0.05000000000000001,
+            "acc_norm": 0.55,
+            "acc_norm_stderr": 0.05000000000000001
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.6859504132231405,
+            "acc_stderr": 0.04236964753041019,
+            "acc_norm": 0.6859504132231405,
+            "acc_norm_stderr": 0.04236964753041019
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.6052631578947368,
+            "acc_stderr": 0.039777499346220734,
+            "acc_norm": 0.6052631578947368,
+            "acc_norm_stderr": 0.039777499346220734
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.5392156862745098,
+            "acc_stderr": 0.02016552331390791,
+            "acc_norm": 0.5392156862745098,
+            "acc_norm_stderr": 0.02016552331390791
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.35815602836879434,
+            "acc_stderr": 0.02860208586275942,
+            "acc_norm": 0.35815602836879434,
+            "acc_norm_stderr": 0.02860208586275942
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.4107142857142857,
+            "acc_stderr": 0.04669510663875192,
+            "acc_norm": 0.4107142857142857,
+            "acc_norm_stderr": 0.04669510663875192
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.44907407407407407,
+            "acc_stderr": 0.03392238405321617,
+            "acc_norm": 0.44907407407407407,
+            "acc_norm_stderr": 0.03392238405321617
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.3452513966480447,
+            "acc_stderr": 0.015901432608930354,
+            "acc_norm": 0.3452513966480447,
+            "acc_norm_stderr": 0.015901432608930354
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.43,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.43,
+            "acc_norm_stderr": 0.049756985195624284
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695238,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695238
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.45588235294117646,
+            "acc_stderr": 0.030254372573976694,
+            "acc_norm": 0.45588235294117646,
+            "acc_norm_stderr": 0.030254372573976694
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6204081632653061,
+            "acc_stderr": 0.031067211262872457,
+            "acc_norm": 0.6204081632653061,
+            "acc_norm_stderr": 0.031067211262872457
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.6582278481012658,
+            "acc_stderr": 0.030874537537553617,
+            "acc_norm": 0.6582278481012658,
+            "acc_norm_stderr": 0.030874537537553617
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.4152542372881356,
+            "acc_stderr": 0.012585471793400667,
+            "acc_norm": 0.4152542372881356,
+            "acc_norm_stderr": 0.012585471793400667
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.5343137254901961,
+            "acc_stderr": 0.03501038327635896,
+            "acc_norm": 0.5343137254901961,
+            "acc_norm_stderr": 0.03501038327635896
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.5454545454545454,
+            "acc_stderr": 0.038881769216741004,
+            "acc_norm": 0.5454545454545454,
+            "acc_norm_stderr": 0.038881769216741004
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.4663402692778458,
+            "mc1_stderr": 0.01746379386716811,
+            "mc2": NaN,
+            "mc2_stderr": NaN
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.44037780401416765,
+            "acc_stderr": 0.01706769977431298,
+            "acc_norm": 0.44510035419126326,
+            "acc_norm_stderr": 0.01708641743100547
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-DPO-v1.4",
+        "model_sha": "a6e64075fafaa3d5e393ff89c3cb26f9615e6de9",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json b/eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json
new file mode 100644
index 0000000000000000000000000000000000000000..8347643b533d653103e30f952f4381c7265e6bee
--- /dev/null
+++ b/eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 5
+        },
+        "quarterly": {
+          "quarterly": 5
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.6638225255972696,
+            "acc_stderr": 0.013804855026205756,
+            "acc_norm": 0.7278156996587031,
+            "acc_norm_stderr": 0.013006600406423709
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.45648277235610435,
+            "acc_stderr": 0.004970846697552306,
+            "acc_norm": 0.6349332802230632,
+            "acc_norm_stderr": 0.004804649197163697
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.7309941520467836,
+            "acc_stderr": 0.0340105262010409,
+            "acc_norm": 0.7309941520467836,
+            "acc_norm_stderr": 0.0340105262010409
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.7766990291262136,
+            "acc_stderr": 0.04123553189891431,
+            "acc_norm": 0.7766990291262136,
+            "acc_norm_stderr": 0.04123553189891431
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.7343550446998723,
+            "acc_stderr": 0.01579430248788872,
+            "acc_norm": 0.7343550446998723,
+            "acc_norm_stderr": 0.01579430248788872
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.45185185185185184,
+            "acc_stderr": 0.04299268905480863,
+            "acc_norm": 0.45185185185185184,
+            "acc_norm_stderr": 0.04299268905480863
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.35,
+            "acc_stderr": 0.04793724854411019,
+            "acc_norm": 0.35,
+            "acc_norm_stderr": 0.04793724854411019
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.5276595744680851,
+            "acc_stderr": 0.03263597118409769,
+            "acc_norm": 0.5276595744680851,
+            "acc_norm_stderr": 0.03263597118409769
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.4759036144578313,
+            "acc_stderr": 0.03887971849597264,
+            "acc_norm": 0.4759036144578313,
+            "acc_norm_stderr": 0.03887971849597264
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.6559485530546624,
+            "acc_stderr": 0.026981478043648043,
+            "acc_norm": 0.6559485530546624,
+            "acc_norm_stderr": 0.026981478043648043
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.6412556053811659,
+            "acc_stderr": 0.032190792004199956,
+            "acc_norm": 0.6412556053811659,
+            "acc_norm_stderr": 0.032190792004199956
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.648854961832061,
+            "acc_stderr": 0.04186445163013751,
+            "acc_norm": 0.648854961832061,
+            "acc_norm_stderr": 0.04186445163013751
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.54,
+            "acc_stderr": 0.05009082659620333,
+            "acc_norm": 0.54,
+            "acc_norm_stderr": 0.05009082659620333
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.7777777777777778,
+            "acc_stderr": 0.029620227874790465,
+            "acc_norm": 0.7777777777777778,
+            "acc_norm_stderr": 0.029620227874790465
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.5103448275862069,
+            "acc_stderr": 0.04165774775728762,
+            "acc_norm": 0.5103448275862069,
+            "acc_norm_stderr": 0.04165774775728762
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.3627450980392157,
+            "acc_stderr": 0.04784060704105655,
+            "acc_norm": 0.3627450980392157,
+            "acc_norm_stderr": 0.04784060704105655
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.6680672268907563,
+            "acc_stderr": 0.03058869701378364,
+            "acc_norm": 0.6680672268907563,
+            "acc_norm_stderr": 0.03058869701378364
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.6384615384615384,
+            "acc_stderr": 0.024359581465397,
+            "acc_norm": 0.6384615384615384,
+            "acc_norm_stderr": 0.024359581465397
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.65,
+            "acc_stderr": 0.04793724854411021,
+            "acc_norm": 0.65,
+            "acc_norm_stderr": 0.04793724854411021
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.37,
+            "acc_stderr": 0.04852365870939099,
+            "acc_norm": 0.37,
+            "acc_norm_stderr": 0.04852365870939099
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.6851851851851852,
+            "acc_stderr": 0.04489931073591312,
+            "acc_norm": 0.6851851851851852,
+            "acc_norm_stderr": 0.04489931073591312
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.46798029556650245,
+            "acc_stderr": 0.035107665979592154,
+            "acc_norm": 0.46798029556650245,
+            "acc_norm_stderr": 0.035107665979592154
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.6548387096774193,
+            "acc_stderr": 0.02704574657353432,
+            "acc_norm": 0.6548387096774193,
+            "acc_norm_stderr": 0.02704574657353432
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.8162393162393162,
+            "acc_stderr": 0.025372139671722933,
+            "acc_norm": 0.8162393162393162,
+            "acc_norm_stderr": 0.025372139671722933
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.5773584905660377,
+            "acc_stderr": 0.03040233144576954,
+            "acc_norm": 0.5773584905660377,
+            "acc_norm_stderr": 0.03040233144576954
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.6454545454545455,
+            "acc_stderr": 0.045820048415054174,
+            "acc_norm": 0.6454545454545455,
+            "acc_norm_stderr": 0.045820048415054174
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.4074074074074074,
+            "acc_stderr": 0.029958249250082118,
+            "acc_norm": 0.4074074074074074,
+            "acc_norm_stderr": 0.029958249250082118
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.3509933774834437,
+            "acc_stderr": 0.03896981964257375,
+            "acc_norm": 0.3509933774834437,
+            "acc_norm_stderr": 0.03896981964257375
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.7263681592039801,
+            "acc_stderr": 0.03152439186555404,
+            "acc_norm": 0.7263681592039801,
+            "acc_norm_stderr": 0.03152439186555404
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.5375722543352601,
+            "acc_stderr": 0.0380168510452446,
+            "acc_norm": 0.5375722543352601,
+            "acc_norm_stderr": 0.0380168510452446
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.4365079365079365,
+            "acc_stderr": 0.025542846817400496,
+            "acc_norm": 0.4365079365079365,
+            "acc_norm_stderr": 0.025542846817400496
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.5694444444444444,
+            "acc_stderr": 0.04140685639111503,
+            "acc_norm": 0.5694444444444444,
+            "acc_norm_stderr": 0.04140685639111503
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.43,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.43,
+            "acc_norm_stderr": 0.049756985195624284
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.78,
+            "acc_stderr": 0.04163331998932263,
+            "acc_norm": 0.78,
+            "acc_norm_stderr": 0.04163331998932263
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.6098265895953757,
+            "acc_stderr": 0.026261677607806642,
+            "acc_norm": 0.6098265895953757,
+            "acc_norm_stderr": 0.026261677607806642
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.656441717791411,
+            "acc_stderr": 0.03731133519673893,
+            "acc_norm": 0.656441717791411,
+            "acc_norm_stderr": 0.03731133519673893
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.6574074074074074,
+            "acc_stderr": 0.02640614597362568,
+            "acc_norm": 0.6574074074074074,
+            "acc_norm_stderr": 0.02640614597362568
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.38,
+            "acc_stderr": 0.04878317312145632,
+            "acc_norm": 0.38,
+            "acc_norm_stderr": 0.04878317312145632
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.7668393782383419,
+            "acc_stderr": 0.03051611137147601,
+            "acc_norm": 0.7668393782383419,
+            "acc_norm_stderr": 0.03051611137147601
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.45614035087719296,
+            "acc_stderr": 0.046854730419077895,
+            "acc_norm": 0.45614035087719296,
+            "acc_norm_stderr": 0.046854730419077895
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.7853211009174312,
+            "acc_stderr": 0.017604304149256494,
+            "acc_norm": 0.7853211009174312,
+            "acc_norm_stderr": 0.017604304149256494
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.4523809523809524,
+            "acc_stderr": 0.044518079590553275,
+            "acc_norm": 0.4523809523809524,
+            "acc_norm_stderr": 0.044518079590553275
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.6405228758169934,
+            "acc_stderr": 0.027475969910660952,
+            "acc_norm": 0.6405228758169934,
+            "acc_norm_stderr": 0.027475969910660952
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695237,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695237
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.7933884297520661,
+            "acc_stderr": 0.03695980128098824,
+            "acc_norm": 0.7933884297520661,
+            "acc_norm_stderr": 0.03695980128098824
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.6842105263157895,
+            "acc_stderr": 0.0378272898086547,
+            "acc_norm": 0.6842105263157895,
+            "acc_norm_stderr": 0.0378272898086547
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.5964052287581699,
+            "acc_stderr": 0.019848280168401164,
+            "acc_norm": 0.5964052287581699,
+            "acc_norm_stderr": 0.019848280168401164
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.4397163120567376,
+            "acc_stderr": 0.02960991207559411,
+            "acc_norm": 0.4397163120567376,
+            "acc_norm_stderr": 0.02960991207559411
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.39285714285714285,
+            "acc_stderr": 0.04635550135609976,
+            "acc_norm": 0.39285714285714285,
+            "acc_norm_stderr": 0.04635550135609976
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.5787037037037037,
+            "acc_stderr": 0.03367462138896078,
+            "acc_norm": 0.5787037037037037,
+            "acc_norm_stderr": 0.03367462138896078
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.264804469273743,
+            "acc_stderr": 0.01475690648326066,
+            "acc_norm": 0.264804469273743,
+            "acc_norm_stderr": 0.01475690648326066
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.52,
+            "acc_stderr": 0.050211673156867795,
+            "acc_norm": 0.52,
+            "acc_norm_stderr": 0.050211673156867795
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.7,
+            "acc_stderr": 0.04605661864718381,
+            "acc_norm": 0.7,
+            "acc_norm_stderr": 0.04605661864718381
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.5588235294117647,
+            "acc_stderr": 0.03016191193076711,
+            "acc_norm": 0.5588235294117647,
+            "acc_norm_stderr": 0.03016191193076711
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6448979591836734,
+            "acc_stderr": 0.030635655150387634,
+            "acc_norm": 0.6448979591836734,
+            "acc_norm_stderr": 0.030635655150387634
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.7426160337552743,
+            "acc_stderr": 0.028458820991460302,
+            "acc_norm": 0.7426160337552743,
+            "acc_norm_stderr": 0.028458820991460302
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.44654498044328556,
+            "acc_stderr": 0.012697046024399661,
+            "acc_norm": 0.44654498044328556,
+            "acc_norm_stderr": 0.012697046024399661
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.6225490196078431,
+            "acc_stderr": 0.03402272044340703,
+            "acc_norm": 0.6225490196078431,
+            "acc_norm_stderr": 0.03402272044340703
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.6303030303030303,
+            "acc_stderr": 0.03769430314512569,
+            "acc_norm": 0.6303030303030303,
+            "acc_norm_stderr": 0.03769430314512569
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.6634026927784578,
+            "mc1_stderr": 0.0165424128094949,
+            "mc2": 0.7515104740134964,
+            "mc2_stderr": 0.014200593490054807
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.5147579693034239,
+            "acc_stderr": 0.01718286443499856,
+            "acc_norm": 0.526564344746163,
+            "acc_norm_stderr": 0.017166075717577747
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-HES-DPO-v1.5",
+        "model_sha": "f0bc8e2566ba28c8232d7c690098e634ea894e8d",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json b/eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json
new file mode 100644
index 0000000000000000000000000000000000000000..79c8abce9a4e3db7fb8fa93c24ae76400cea8f9c
--- /dev/null
+++ b/eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 3
+        },
+        "quarterly": {
+          "quarterly": 3
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.6646757679180887,
+            "acc_stderr": 0.013796182947785564,
+            "acc_norm": 0.7244027303754266,
+            "acc_norm_stderr": 0.01305716965576184
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.46036646086436966,
+            "acc_stderr": 0.004974080638364276,
+            "acc_norm": 0.6195976897032464,
+            "acc_norm_stderr": 0.004844935327599196
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.7602339181286549,
+            "acc_stderr": 0.03274485211946956,
+            "acc_norm": 0.7602339181286549,
+            "acc_norm_stderr": 0.03274485211946956
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.7766990291262136,
+            "acc_stderr": 0.04123553189891431,
+            "acc_norm": 0.7766990291262136,
+            "acc_norm_stderr": 0.04123553189891431
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.7381864623243933,
+            "acc_stderr": 0.01572083867844526,
+            "acc_norm": 0.7381864623243933,
+            "acc_norm_stderr": 0.01572083867844526
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.5037037037037037,
+            "acc_stderr": 0.04319223625811331,
+            "acc_norm": 0.5037037037037037,
+            "acc_norm_stderr": 0.04319223625811331
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.35,
+            "acc_stderr": 0.04793724854411019,
+            "acc_norm": 0.35,
+            "acc_norm_stderr": 0.04793724854411019
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.5404255319148936,
+            "acc_stderr": 0.032579014820998335,
+            "acc_norm": 0.5404255319148936,
+            "acc_norm_stderr": 0.032579014820998335
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.5180722891566265,
+            "acc_stderr": 0.038899512528272166,
+            "acc_norm": 0.5180722891566265,
+            "acc_norm_stderr": 0.038899512528272166
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.6559485530546624,
+            "acc_stderr": 0.026981478043648043,
+            "acc_norm": 0.6559485530546624,
+            "acc_norm_stderr": 0.026981478043648043
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.6591928251121076,
+            "acc_stderr": 0.0318114974705536,
+            "acc_norm": 0.6591928251121076,
+            "acc_norm_stderr": 0.0318114974705536
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.6564885496183206,
+            "acc_stderr": 0.041649760719448786,
+            "acc_norm": 0.6564885496183206,
+            "acc_norm_stderr": 0.041649760719448786
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.52,
+            "acc_stderr": 0.050211673156867795,
+            "acc_norm": 0.52,
+            "acc_norm_stderr": 0.050211673156867795
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.7575757575757576,
+            "acc_stderr": 0.030532892233932036,
+            "acc_norm": 0.7575757575757576,
+            "acc_norm_stderr": 0.030532892233932036
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.5586206896551724,
+            "acc_stderr": 0.04137931034482757,
+            "acc_norm": 0.5586206896551724,
+            "acc_norm_stderr": 0.04137931034482757
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.3137254901960784,
+            "acc_stderr": 0.04617034827006717,
+            "acc_norm": 0.3137254901960784,
+            "acc_norm_stderr": 0.04617034827006717
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.6512605042016807,
+            "acc_stderr": 0.03095663632856655,
+            "acc_norm": 0.6512605042016807,
+            "acc_norm_stderr": 0.03095663632856655
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.6230769230769231,
+            "acc_stderr": 0.024570975364225995,
+            "acc_norm": 0.6230769230769231,
+            "acc_norm_stderr": 0.024570975364225995
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.73,
+            "acc_stderr": 0.04461960433384739,
+            "acc_norm": 0.73,
+            "acc_norm_stderr": 0.04461960433384739
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.32,
+            "acc_stderr": 0.04688261722621505,
+            "acc_norm": 0.32,
+            "acc_norm_stderr": 0.04688261722621505
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.7037037037037037,
+            "acc_stderr": 0.04414343666854933,
+            "acc_norm": 0.7037037037037037,
+            "acc_norm_stderr": 0.04414343666854933
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.4630541871921182,
+            "acc_stderr": 0.035083705204426656,
+            "acc_norm": 0.4630541871921182,
+            "acc_norm_stderr": 0.035083705204426656
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.603225806451613,
+            "acc_stderr": 0.027831231605767944,
+            "acc_norm": 0.603225806451613,
+            "acc_norm_stderr": 0.027831231605767944
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.8205128205128205,
+            "acc_stderr": 0.025140935950335435,
+            "acc_norm": 0.8205128205128205,
+            "acc_norm_stderr": 0.025140935950335435
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.5962264150943396,
+            "acc_stderr": 0.03019761160019795,
+            "acc_norm": 0.5962264150943396,
+            "acc_norm_stderr": 0.03019761160019795
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.6181818181818182,
+            "acc_stderr": 0.046534298079135075,
+            "acc_norm": 0.6181818181818182,
+            "acc_norm_stderr": 0.046534298079135075
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.37407407407407406,
+            "acc_stderr": 0.029502861128955293,
+            "acc_norm": 0.37407407407407406,
+            "acc_norm_stderr": 0.029502861128955293
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.32450331125827814,
+            "acc_stderr": 0.038227469376587525,
+            "acc_norm": 0.32450331125827814,
+            "acc_norm_stderr": 0.038227469376587525
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.7164179104477612,
+            "acc_stderr": 0.03187187537919796,
+            "acc_norm": 0.7164179104477612,
+            "acc_norm_stderr": 0.03187187537919796
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.5375722543352601,
+            "acc_stderr": 0.03801685104524458,
+            "acc_norm": 0.5375722543352601,
+            "acc_norm_stderr": 0.03801685104524458
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.42857142857142855,
+            "acc_stderr": 0.025487187147859372,
+            "acc_norm": 0.42857142857142855,
+            "acc_norm_stderr": 0.025487187147859372
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.5902777777777778,
+            "acc_stderr": 0.04112490974670787,
+            "acc_norm": 0.5902777777777778,
+            "acc_norm_stderr": 0.04112490974670787
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.45,
+            "acc_stderr": 0.049999999999999996,
+            "acc_norm": 0.45,
+            "acc_norm_stderr": 0.049999999999999996
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.78,
+            "acc_stderr": 0.04163331998932263,
+            "acc_norm": 0.78,
+            "acc_norm_stderr": 0.04163331998932263
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.6184971098265896,
+            "acc_stderr": 0.026152198619726803,
+            "acc_norm": 0.6184971098265896,
+            "acc_norm_stderr": 0.026152198619726803
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.6441717791411042,
+            "acc_stderr": 0.03761521380046734,
+            "acc_norm": 0.6441717791411042,
+            "acc_norm_stderr": 0.03761521380046734
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.6944444444444444,
+            "acc_stderr": 0.025630824975621365,
+            "acc_norm": 0.6944444444444444,
+            "acc_norm_stderr": 0.025630824975621365
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.43,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.43,
+            "acc_norm_stderr": 0.049756985195624284
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.7927461139896373,
+            "acc_stderr": 0.029252823291803638,
+            "acc_norm": 0.7927461139896373,
+            "acc_norm_stderr": 0.029252823291803638
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.43859649122807015,
+            "acc_stderr": 0.04668000738510455,
+            "acc_norm": 0.43859649122807015,
+            "acc_norm_stderr": 0.04668000738510455
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.7853211009174312,
+            "acc_stderr": 0.017604304149256494,
+            "acc_norm": 0.7853211009174312,
+            "acc_norm_stderr": 0.017604304149256494
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.3968253968253968,
+            "acc_stderr": 0.04375888492727062,
+            "acc_norm": 0.3968253968253968,
+            "acc_norm_stderr": 0.04375888492727062
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.6437908496732027,
+            "acc_stderr": 0.027420477662629245,
+            "acc_norm": 0.6437908496732027,
+            "acc_norm_stderr": 0.027420477662629245
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.63,
+            "acc_stderr": 0.04852365870939099,
+            "acc_norm": 0.63,
+            "acc_norm_stderr": 0.04852365870939099
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.7603305785123967,
+            "acc_stderr": 0.03896878985070415,
+            "acc_norm": 0.7603305785123967,
+            "acc_norm_stderr": 0.03896878985070415
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.625,
+            "acc_stderr": 0.039397364351956274,
+            "acc_norm": 0.625,
+            "acc_norm_stderr": 0.039397364351956274
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.619281045751634,
+            "acc_stderr": 0.019643801557924806,
+            "acc_norm": 0.619281045751634,
+            "acc_norm_stderr": 0.019643801557924806
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.45390070921985815,
+            "acc_stderr": 0.029700453247291467,
+            "acc_norm": 0.45390070921985815,
+            "acc_norm_stderr": 0.029700453247291467
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.41964285714285715,
+            "acc_stderr": 0.04684099321077106,
+            "acc_norm": 0.41964285714285715,
+            "acc_norm_stderr": 0.04684099321077106
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.5555555555555556,
+            "acc_stderr": 0.03388857118502326,
+            "acc_norm": 0.5555555555555556,
+            "acc_norm_stderr": 0.03388857118502326
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.3575418994413408,
+            "acc_stderr": 0.016029394474894893,
+            "acc_norm": 0.3575418994413408,
+            "acc_norm_stderr": 0.016029394474894893
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.52,
+            "acc_stderr": 0.050211673156867795,
+            "acc_norm": 0.52,
+            "acc_norm_stderr": 0.050211673156867795
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.75,
+            "acc_stderr": 0.04351941398892446,
+            "acc_norm": 0.75,
+            "acc_norm_stderr": 0.04351941398892446
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.5735294117647058,
+            "acc_stderr": 0.03004261583271486,
+            "acc_norm": 0.5735294117647058,
+            "acc_norm_stderr": 0.03004261583271486
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6816326530612244,
+            "acc_stderr": 0.02982253379398204,
+            "acc_norm": 0.6816326530612244,
+            "acc_norm_stderr": 0.02982253379398204
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.7468354430379747,
+            "acc_stderr": 0.028304657943035293,
+            "acc_norm": 0.7468354430379747,
+            "acc_norm_stderr": 0.028304657943035293
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.455019556714472,
+            "acc_stderr": 0.012718456618701789,
+            "acc_norm": 0.455019556714472,
+            "acc_norm_stderr": 0.012718456618701789
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.6666666666666666,
+            "acc_stderr": 0.033086111132364364,
+            "acc_norm": 0.6666666666666666,
+            "acc_norm_stderr": 0.033086111132364364
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.6484848484848484,
+            "acc_stderr": 0.037282069986826503,
+            "acc_norm": 0.6484848484848484,
+            "acc_norm_stderr": 0.037282069986826503
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.605875152998776,
+            "mc1_stderr": 0.017106588140700332,
+            "mc2": 0.7254831072808595,
+            "mc2_stderr": 0.014162522228042162
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.5926800472255017,
+            "acc_stderr": 0.01689245669519127,
+            "acc_norm": 0.6269185360094451,
+            "acc_norm_stderr": 0.016627318275137453
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-SON-SFT-v0.1",
+        "model_sha": "01286a13088332c1eda4279b5bcfa7a0a33e145f",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/x2bee/POLAR-14B-v0.2/result.json b/eval-results/x2bee/POLAR-14B-v0.2/result.json
new file mode 100644
index 0000000000000000000000000000000000000000..63fae0f070c9ffe9477f498fe19733803d29c81e
--- /dev/null
+++ b/eval-results/x2bee/POLAR-14B-v0.2/result.json
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 2
+        },
+        "quarterly": {
+          "quarterly": 2
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.7465870307167235,
+            "acc_stderr": 0.012710896778378602,
+            "acc_norm": 0.7687713310580204,
+            "acc_norm_stderr": 0.012320858834772264
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.681736705835491,
+            "acc_stderr": 0.004648503177353952,
+            "acc_norm": 0.7999402509460267,
+            "acc_norm_stderr": 0.003992272261659531
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.6549707602339181,
+            "acc_stderr": 0.036459813773888065,
+            "acc_norm": 0.6549707602339181,
+            "acc_norm_stderr": 0.036459813773888065
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.7378640776699029,
+            "acc_stderr": 0.043546310772605956,
+            "acc_norm": 0.7378640776699029,
+            "acc_norm_stderr": 0.043546310772605956
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.6922094508301405,
+            "acc_stderr": 0.016506045045155633,
+            "acc_norm": 0.6922094508301405,
+            "acc_norm_stderr": 0.016506045045155633
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.4666666666666667,
+            "acc_stderr": 0.043097329010363554,
+            "acc_norm": 0.4666666666666667,
+            "acc_norm_stderr": 0.043097329010363554
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.35,
+            "acc_stderr": 0.047937248544110196,
+            "acc_norm": 0.35,
+            "acc_norm_stderr": 0.047937248544110196
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.4595744680851064,
+            "acc_stderr": 0.03257901482099836,
+            "acc_norm": 0.4595744680851064,
+            "acc_norm_stderr": 0.03257901482099836
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.4879518072289157,
+            "acc_stderr": 0.03891364495835821,
+            "acc_norm": 0.4879518072289157,
+            "acc_norm_stderr": 0.03891364495835821
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.6045016077170418,
+            "acc_stderr": 0.027770918531427834,
+            "acc_norm": 0.6045016077170418,
+            "acc_norm_stderr": 0.027770918531427834
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.6233183856502242,
+            "acc_stderr": 0.03252113489929188,
+            "acc_norm": 0.6233183856502242,
+            "acc_norm_stderr": 0.03252113489929188
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.6412213740458015,
+            "acc_stderr": 0.04206739313864908,
+            "acc_norm": 0.6412213740458015,
+            "acc_norm_stderr": 0.04206739313864908
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.51,
+            "acc_stderr": 0.05024183937956911,
+            "acc_norm": 0.51,
+            "acc_norm_stderr": 0.05024183937956911
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.7222222222222222,
+            "acc_stderr": 0.03191178226713547,
+            "acc_norm": 0.7222222222222222,
+            "acc_norm_stderr": 0.03191178226713547
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.5241379310344828,
+            "acc_stderr": 0.0416180850350153,
+            "acc_norm": 0.5241379310344828,
+            "acc_norm_stderr": 0.0416180850350153
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.3235294117647059,
+            "acc_stderr": 0.046550104113196177,
+            "acc_norm": 0.3235294117647059,
+            "acc_norm_stderr": 0.046550104113196177
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.6764705882352942,
+            "acc_stderr": 0.030388353551886793,
+            "acc_norm": 0.6764705882352942,
+            "acc_norm_stderr": 0.030388353551886793
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.6384615384615384,
+            "acc_stderr": 0.024359581465397,
+            "acc_norm": 0.6384615384615384,
+            "acc_norm_stderr": 0.024359581465397
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.65,
+            "acc_stderr": 0.0479372485441102,
+            "acc_norm": 0.65,
+            "acc_norm_stderr": 0.0479372485441102
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.31,
+            "acc_stderr": 0.04648231987117316,
+            "acc_norm": 0.31,
+            "acc_norm_stderr": 0.04648231987117316
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.6296296296296297,
+            "acc_stderr": 0.04668408033024931,
+            "acc_norm": 0.6296296296296297,
+            "acc_norm_stderr": 0.04668408033024931
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.4729064039408867,
+            "acc_stderr": 0.03512819077876105,
+            "acc_norm": 0.4729064039408867,
+            "acc_norm_stderr": 0.03512819077876105
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.5709677419354838,
+            "acc_stderr": 0.028156036538233193,
+            "acc_norm": 0.5709677419354838,
+            "acc_norm_stderr": 0.028156036538233193
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.8034188034188035,
+            "acc_stderr": 0.026035386098951292,
+            "acc_norm": 0.8034188034188035,
+            "acc_norm_stderr": 0.026035386098951292
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.5547169811320755,
+            "acc_stderr": 0.030588052974270655,
+            "acc_norm": 0.5547169811320755,
+            "acc_norm_stderr": 0.030588052974270655
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.6363636363636364,
+            "acc_stderr": 0.04607582090719976,
+            "acc_norm": 0.6363636363636364,
+            "acc_norm_stderr": 0.04607582090719976
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.3592592592592593,
+            "acc_stderr": 0.029252905927251976,
+            "acc_norm": 0.3592592592592593,
+            "acc_norm_stderr": 0.029252905927251976
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.3576158940397351,
+            "acc_stderr": 0.03913453431177258,
+            "acc_norm": 0.3576158940397351,
+            "acc_norm_stderr": 0.03913453431177258
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.6268656716417911,
+            "acc_stderr": 0.034198326081760065,
+            "acc_norm": 0.6268656716417911,
+            "acc_norm_stderr": 0.034198326081760065
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.48554913294797686,
+            "acc_stderr": 0.03810871630454764,
+            "acc_norm": 0.48554913294797686,
+            "acc_norm_stderr": 0.03810871630454764
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.4497354497354497,
+            "acc_stderr": 0.025620857042936648,
+            "acc_norm": 0.4497354497354497,
+            "acc_norm_stderr": 0.025620857042936648
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.6041666666666666,
+            "acc_stderr": 0.04089465449325582,
+            "acc_norm": 0.6041666666666666,
+            "acc_norm_stderr": 0.04089465449325582
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.32,
+            "acc_stderr": 0.046882617226215034,
+            "acc_norm": 0.32,
+            "acc_norm_stderr": 0.046882617226215034
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.71,
+            "acc_stderr": 0.045604802157206824,
+            "acc_norm": 0.71,
+            "acc_norm_stderr": 0.045604802157206824
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.5664739884393064,
+            "acc_stderr": 0.026680134761679217,
+            "acc_norm": 0.5664739884393064,
+            "acc_norm_stderr": 0.026680134761679217
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.6196319018404908,
+            "acc_stderr": 0.038142698932618374,
+            "acc_norm": 0.6196319018404908,
+            "acc_norm_stderr": 0.038142698932618374
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.6574074074074074,
+            "acc_stderr": 0.026406145973625686,
+            "acc_norm": 0.6574074074074074,
+            "acc_norm_stderr": 0.026406145973625686
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.37,
+            "acc_stderr": 0.04852365870939098,
+            "acc_norm": 0.37,
+            "acc_norm_stderr": 0.04852365870939098
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.7616580310880829,
+            "acc_stderr": 0.030748905363909895,
+            "acc_norm": 0.7616580310880829,
+            "acc_norm_stderr": 0.030748905363909895
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.5,
+            "acc_stderr": 0.047036043419179864,
+            "acc_norm": 0.5,
+            "acc_norm_stderr": 0.047036043419179864
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.7211009174311926,
+            "acc_stderr": 0.01922746887646353,
+            "acc_norm": 0.7211009174311926,
+            "acc_norm_stderr": 0.01922746887646353
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.42857142857142855,
+            "acc_stderr": 0.0442626668137991,
+            "acc_norm": 0.42857142857142855,
+            "acc_norm_stderr": 0.0442626668137991
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.5816993464052288,
+            "acc_stderr": 0.0282451340243873,
+            "acc_norm": 0.5816993464052288,
+            "acc_norm_stderr": 0.0282451340243873
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.73,
+            "acc_stderr": 0.044619604333847394,
+            "acc_norm": 0.73,
+            "acc_norm_stderr": 0.044619604333847394
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.7107438016528925,
+            "acc_stderr": 0.041391127276354626,
+            "acc_norm": 0.7107438016528925,
+            "acc_norm_stderr": 0.041391127276354626
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.6513157894736842,
+            "acc_stderr": 0.038781398887976104,
+            "acc_norm": 0.6513157894736842,
+            "acc_norm_stderr": 0.038781398887976104
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.5686274509803921,
+            "acc_stderr": 0.020036393768352624,
+            "acc_norm": 0.5686274509803921,
+            "acc_norm_stderr": 0.020036393768352624
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.45390070921985815,
+            "acc_stderr": 0.029700453247291477,
+            "acc_norm": 0.45390070921985815,
+            "acc_norm_stderr": 0.029700453247291477
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.4642857142857143,
+            "acc_stderr": 0.04733667890053756,
+            "acc_norm": 0.4642857142857143,
+            "acc_norm_stderr": 0.04733667890053756
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.5092592592592593,
+            "acc_stderr": 0.034093869469927006,
+            "acc_norm": 0.5092592592592593,
+            "acc_norm_stderr": 0.034093869469927006
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.37206703910614525,
+            "acc_stderr": 0.016165847583563295,
+            "acc_norm": 0.37206703910614525,
+            "acc_norm_stderr": 0.016165847583563295
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.43,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.43,
+            "acc_norm_stderr": 0.049756985195624284
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.71,
+            "acc_stderr": 0.045604802157206845,
+            "acc_norm": 0.71,
+            "acc_norm_stderr": 0.045604802157206845
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.5404411764705882,
+            "acc_stderr": 0.030273325077345755,
+            "acc_norm": 0.5404411764705882,
+            "acc_norm_stderr": 0.030273325077345755
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6122448979591837,
+            "acc_stderr": 0.03119223072679566,
+            "acc_norm": 0.6122448979591837,
+            "acc_norm_stderr": 0.03119223072679566
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.7257383966244726,
+            "acc_stderr": 0.029041333510598025,
+            "acc_norm": 0.7257383966244726,
+            "acc_norm_stderr": 0.029041333510598025
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.4641460234680574,
+            "acc_stderr": 0.01273736131873058,
+            "acc_norm": 0.4641460234680574,
+            "acc_norm_stderr": 0.01273736131873058
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.6568627450980392,
+            "acc_stderr": 0.03332139944668086,
+            "acc_norm": 0.6568627450980392,
+            "acc_norm_stderr": 0.03332139944668086
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.6,
+            "acc_stderr": 0.03825460278380025,
+            "acc_norm": 0.6,
+            "acc_norm_stderr": 0.03825460278380025
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.7246022031823746,
+            "mc1_stderr": 0.01563813566777552,
+            "mc2": 0.8107575910195236,
+            "mc2_stderr": 0.013335029489665237
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.525383707201889,
+            "acc_stderr": 0.017168187201429253,
+            "acc_norm": 0.5442739079102715,
+            "acc_norm_stderr": 0.017122829143292655
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-v0.2",
+        "model_sha": "8d905623a3972e11260420130039c62e115cbbaa",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/eval-results/x2bee/POLAR-14B-v0.5/result.json b/eval-results/x2bee/POLAR-14B-v0.5/result.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cd609e2bdf51cc5b4d666ff5ee4db16a30aed52
--- /dev/null
+++ b/eval-results/x2bee/POLAR-14B-v0.5/result.json
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+          "daily": 1
+        },
+        "quarterly": {
+          "quarterly": 1
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.75,
+            "acc_stderr": 0.012653835621466646,
+            "acc_norm": 0.7798634812286689,
+            "acc_norm_stderr": 0.012108124883460988
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.6500697072296355,
+            "acc_stderr": 0.004759729267943182,
+            "acc_norm": 0.775542720573591,
+            "acc_norm_stderr": 0.004163717220873764
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.6374269005847953,
+            "acc_stderr": 0.036871306155620606,
+            "acc_norm": 0.6374269005847953,
+            "acc_norm_stderr": 0.036871306155620606
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.7087378640776699,
+            "acc_stderr": 0.044986763205729224,
+            "acc_norm": 0.7087378640776699,
+            "acc_norm_stderr": 0.044986763205729224
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.6730523627075351,
+            "acc_stderr": 0.016774908180131484,
+            "acc_norm": 0.6730523627075351,
+            "acc_norm_stderr": 0.016774908180131484
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.45185185185185184,
+            "acc_stderr": 0.04299268905480864,
+            "acc_norm": 0.45185185185185184,
+            "acc_norm_stderr": 0.04299268905480864
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.36,
+            "acc_stderr": 0.048241815132442176,
+            "acc_norm": 0.36,
+            "acc_norm_stderr": 0.048241815132442176
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.4723404255319149,
+            "acc_stderr": 0.03263597118409769,
+            "acc_norm": 0.4723404255319149,
+            "acc_norm_stderr": 0.03263597118409769
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.46987951807228917,
+            "acc_stderr": 0.03885425420866766,
+            "acc_norm": 0.46987951807228917,
+            "acc_norm_stderr": 0.03885425420866766
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.594855305466238,
+            "acc_stderr": 0.027882383791325963,
+            "acc_norm": 0.594855305466238,
+            "acc_norm_stderr": 0.027882383791325963
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.6412556053811659,
+            "acc_stderr": 0.032190792004199956,
+            "acc_norm": 0.6412556053811659,
+            "acc_norm_stderr": 0.032190792004199956
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.5954198473282443,
+            "acc_stderr": 0.043046937953806645,
+            "acc_norm": 0.5954198473282443,
+            "acc_norm_stderr": 0.043046937953806645
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.5,
+            "acc_stderr": 0.050251890762960605,
+            "acc_norm": 0.5,
+            "acc_norm_stderr": 0.050251890762960605
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.7272727272727273,
+            "acc_stderr": 0.03173071239071724,
+            "acc_norm": 0.7272727272727273,
+            "acc_norm_stderr": 0.03173071239071724
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.503448275862069,
+            "acc_stderr": 0.0416656757710158,
+            "acc_norm": 0.503448275862069,
+            "acc_norm_stderr": 0.0416656757710158
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.3431372549019608,
+            "acc_stderr": 0.04724007352383888,
+            "acc_norm": 0.3431372549019608,
+            "acc_norm_stderr": 0.04724007352383888
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.6596638655462185,
+            "acc_stderr": 0.03077805742293167,
+            "acc_norm": 0.6596638655462185,
+            "acc_norm_stderr": 0.03077805742293167
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.6102564102564103,
+            "acc_stderr": 0.024726967886647078,
+            "acc_norm": 0.6102564102564103,
+            "acc_norm_stderr": 0.024726967886647078
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.67,
+            "acc_stderr": 0.047258156262526094,
+            "acc_norm": 0.67,
+            "acc_norm_stderr": 0.047258156262526094
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.047258156262526045,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.047258156262526045
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.6481481481481481,
+            "acc_stderr": 0.04616631111801714,
+            "acc_norm": 0.6481481481481481,
+            "acc_norm_stderr": 0.04616631111801714
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.4729064039408867,
+            "acc_stderr": 0.03512819077876105,
+            "acc_norm": 0.4729064039408867,
+            "acc_norm_stderr": 0.03512819077876105
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.5709677419354838,
+            "acc_stderr": 0.028156036538233193,
+            "acc_norm": 0.5709677419354838,
+            "acc_norm_stderr": 0.028156036538233193
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.7735042735042735,
+            "acc_stderr": 0.027421007295392943,
+            "acc_norm": 0.7735042735042735,
+            "acc_norm_stderr": 0.027421007295392943
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.5660377358490566,
+            "acc_stderr": 0.030503292013342596,
+            "acc_norm": 0.5660377358490566,
+            "acc_norm_stderr": 0.030503292013342596
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.6272727272727273,
+            "acc_stderr": 0.04631381319425465,
+            "acc_norm": 0.6272727272727273,
+            "acc_norm_stderr": 0.04631381319425465
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.3333333333333333,
+            "acc_stderr": 0.0287420409039485,
+            "acc_norm": 0.3333333333333333,
+            "acc_norm_stderr": 0.0287420409039485
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.39072847682119205,
+            "acc_stderr": 0.039837983066598075,
+            "acc_norm": 0.39072847682119205,
+            "acc_norm_stderr": 0.039837983066598075
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.6417910447761194,
+            "acc_stderr": 0.03390393042268814,
+            "acc_norm": 0.6417910447761194,
+            "acc_norm_stderr": 0.03390393042268814
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.5028901734104047,
+            "acc_stderr": 0.038124005659748335,
+            "acc_norm": 0.5028901734104047,
+            "acc_norm_stderr": 0.038124005659748335
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.42857142857142855,
+            "acc_stderr": 0.025487187147859372,
+            "acc_norm": 0.42857142857142855,
+            "acc_norm_stderr": 0.025487187147859372
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.6180555555555556,
+            "acc_stderr": 0.040629907841466674,
+            "acc_norm": 0.6180555555555556,
+            "acc_norm_stderr": 0.040629907841466674
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.3,
+            "acc_stderr": 0.046056618647183814,
+            "acc_norm": 0.3,
+            "acc_norm_stderr": 0.046056618647183814
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.72,
+            "acc_stderr": 0.04512608598542127,
+            "acc_norm": 0.72,
+            "acc_norm_stderr": 0.04512608598542127
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.5809248554913294,
+            "acc_stderr": 0.026564178111422622,
+            "acc_norm": 0.5809248554913294,
+            "acc_norm_stderr": 0.026564178111422622
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.6257668711656442,
+            "acc_stderr": 0.03802068102899615,
+            "acc_norm": 0.6257668711656442,
+            "acc_norm_stderr": 0.03802068102899615
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.5987654320987654,
+            "acc_stderr": 0.027272582849839803,
+            "acc_norm": 0.5987654320987654,
+            "acc_norm_stderr": 0.027272582849839803
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.34,
+            "acc_stderr": 0.04760952285695235,
+            "acc_norm": 0.34,
+            "acc_norm_stderr": 0.04760952285695235
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.7512953367875648,
+            "acc_stderr": 0.031195840877700304,
+            "acc_norm": 0.7512953367875648,
+            "acc_norm_stderr": 0.031195840877700304
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.47368421052631576,
+            "acc_stderr": 0.046970851366478626,
+            "acc_norm": 0.47368421052631576,
+            "acc_norm_stderr": 0.046970851366478626
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.7229357798165138,
+            "acc_stderr": 0.019188482590169538,
+            "acc_norm": 0.7229357798165138,
+            "acc_norm_stderr": 0.019188482590169538
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.4523809523809524,
+            "acc_stderr": 0.044518079590553275,
+            "acc_norm": 0.4523809523809524,
+            "acc_norm_stderr": 0.044518079590553275
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.5718954248366013,
+            "acc_stderr": 0.028332397483664278,
+            "acc_norm": 0.5718954248366013,
+            "acc_norm_stderr": 0.028332397483664278
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.68,
+            "acc_stderr": 0.04688261722621504,
+            "acc_norm": 0.68,
+            "acc_norm_stderr": 0.04688261722621504
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.7520661157024794,
+            "acc_stderr": 0.039418975265163025,
+            "acc_norm": 0.7520661157024794,
+            "acc_norm_stderr": 0.039418975265163025
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.618421052631579,
+            "acc_stderr": 0.03953173377749194,
+            "acc_norm": 0.618421052631579,
+            "acc_norm_stderr": 0.03953173377749194
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.5408496732026143,
+            "acc_stderr": 0.020160213617222516,
+            "acc_norm": 0.5408496732026143,
+            "acc_norm_stderr": 0.020160213617222516
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.45390070921985815,
+            "acc_stderr": 0.029700453247291463,
+            "acc_norm": 0.45390070921985815,
+            "acc_norm_stderr": 0.029700453247291463
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.44642857142857145,
+            "acc_stderr": 0.04718471485219588,
+            "acc_norm": 0.44642857142857145,
+            "acc_norm_stderr": 0.04718471485219588
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.5416666666666666,
+            "acc_stderr": 0.03398110890294636,
+            "acc_norm": 0.5416666666666666,
+            "acc_norm_stderr": 0.03398110890294636
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.35195530726256985,
+            "acc_stderr": 0.01597266852368907,
+            "acc_norm": 0.35195530726256985,
+            "acc_norm_stderr": 0.01597266852368907
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.44,
+            "acc_stderr": 0.0498887651569859,
+            "acc_norm": 0.44,
+            "acc_norm_stderr": 0.0498887651569859
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.68,
+            "acc_stderr": 0.04688261722621503,
+            "acc_norm": 0.68,
+            "acc_norm_stderr": 0.04688261722621503
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.5147058823529411,
+            "acc_stderr": 0.03035969707904612,
+            "acc_norm": 0.5147058823529411,
+            "acc_norm_stderr": 0.03035969707904612
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6122448979591837,
+            "acc_stderr": 0.031192230726795656,
+            "acc_norm": 0.6122448979591837,
+            "acc_norm_stderr": 0.031192230726795656
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.7215189873417721,
+            "acc_stderr": 0.029178682304842538,
+            "acc_norm": 0.7215189873417721,
+            "acc_norm_stderr": 0.029178682304842538
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.4634941329856584,
+            "acc_stderr": 0.012736153390214963,
+            "acc_norm": 0.4634941329856584,
+            "acc_norm_stderr": 0.012736153390214963
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.6568627450980392,
+            "acc_stderr": 0.03332139944668086,
+            "acc_norm": 0.6568627450980392,
+            "acc_norm_stderr": 0.03332139944668086
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.5818181818181818,
+            "acc_stderr": 0.03851716319398393,
+            "acc_norm": 0.5818181818181818,
+            "acc_norm_stderr": 0.03851716319398393
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.7833537331701347,
+            "mc1_stderr": 0.014421468452506978,
+            "mc2": 0.8572574997405501,
+            "mc2_stderr": 0.01200311225898601
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.5159386068476978,
+            "acc_stderr": 0.017181617837190195,
+            "acc_norm": 0.5301062573789846,
+            "acc_norm_stderr": 0.01715916359017022
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-v0.5",
+        "model_sha": "74a1ef65a8d650e5358be229def31688738d8c6a",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100755
index 0000000000000000000000000000000000000000..ed4ea4ad1c28eca4cafb4fc1925a92446a8d9efc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,19 @@
+APScheduler==3.10.1
+black==23.11.0
+click==8.1.3
+datasets==2.14.5
+gradio==4.19.2
+gradio_client==0.10.1
+huggingface-hub>=0.18.0
+matplotlib==3.7.1
+numpy==1.24.2
+pandas==2.0.0
+plotly==5.14.1
+python-dateutil==2.8.2
+requests==2.28.2
+sentencepiece
+tqdm==4.65.0
+transformers==4.38.2
+tokenizers>=0.15.0
+gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.1.2 # CI !!!
+torch
\ No newline at end of file
diff --git a/scripts/create_request_file.py b/scripts/create_request_file.py
new file mode 100755
index 0000000000000000000000000000000000000000..56edc72e65a479d05d4254e1a90faa214dd9cb05
--- /dev/null
+++ b/scripts/create_request_file.py
@@ -0,0 +1,107 @@
+import json
+import os
+import pprint
+import re
+from datetime import datetime, timezone
+
+import click
+from colorama import Fore
+from huggingface_hub import HfApi, snapshot_download
+
+EVAL_REQUESTS_PATH = "eval-queue"
+QUEUE_REPO = "open-ko-llm-leaderboard/requests"
+
+precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
+model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
+weight_types = ("Original", "Delta", "Adapter")
+
+
+def get_model_size(model_info, precision: str):
+    """Return the model size in billions of parameters (0 if unknown); GPTQ sizes are multiplied by 8."""
+    try:
+        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+    except (AttributeError, TypeError):
+        try:
+            # Parse only the repo name: an org prefix like "x2bee/" would otherwise be read as a "2b" size.
+            size_match = re.search(r"(\d+\.)?\d+(b|m)", model_info.modelId.split("/")[-1].lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.split("/")[-1].lower()) else 1
+    return size_factor * model_size
+
+
+def main():  # Interactively build one eval-request JSON file and, after confirmation, upload it to QUEUE_REPO.
+    api = HfApi()
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
+    # Prompt for each request field; precision, model type and weight type are limited to the module-level tuples.
+    model_name = click.prompt("Enter model name")
+    revision = click.prompt("Enter revision", default="main")
+    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
+    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
+    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
+    base_model = click.prompt("Enter base model", default="")
+    status = click.prompt("Enter status", default="FINISHED")
+    # Fetch the model's Hub metadata; on any failure print the error in red and return 1.
+    try:
+        model_info = api.model_info(repo_id=model_name, revision=revision)
+    except Exception as e:
+        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
+        return 1
+
+    model_size = get_model_size(model_info=model_info, precision=precision)
+    # "?" is recorded when the license cannot be read from the model's card data.
+    try:
+        license = model_info.cardData["license"]
+    except Exception:
+        license = "?"
+
+    eval_entry = {
+        "model": model_name,
+        "base_model": base_model,
+        "revision": revision,
+        "private": False,
+        "precision": precision,
+        "weight_type": weight_type,
+        "status": status,
+        "submitted_time": current_time,
+        "model_type": model_type,
+        "likes": model_info.likes,
+        "params": model_size,
+        "license": license,
+    }
+    # Split "user/model" into the output sub-folder (user) and the file-name stem (model).
+    user_name = ""
+    model_path = model_name
+    if "/" in model_name:
+        user_name = model_name.split("/")[0]
+        model_path = model_name.split("/")[1]
+
+    pprint.pprint(eval_entry)
+
+    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
+        click.echo("continuing...")
+
+        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
+        os.makedirs(out_dir, exist_ok=True)
+        out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
+        # Write the request locally, then upload it under its path relative to EVAL_REQUESTS_PATH.
+        with open(out_path, "w") as f:
+            f.write(json.dumps(eval_entry))
+
+        api.upload_file(
+            path_or_fileobj=out_path,
+            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            commit_message=f"Add {model_name} to eval queue",
+        )
+    else:
+        click.echo("aborting...")
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # propagate main()'s return value (1 when the Hub lookup fails) as the exit status
diff --git a/scripts/update_request_files.py b/scripts/update_request_files.py
new file mode 100755
index 0000000000000000000000000000000000000000..5a15079c799a8a592dda0197e4d53e26f7b67a75
--- /dev/null
+++ b/scripts/update_request_files.py
@@ -0,0 +1,82 @@
+import json
+import os
+import glob
+import pprint
+import re
+from datetime import datetime, timezone
+
+import click
+from colorama import Fore
+from huggingface_hub import HfApi, snapshot_download
+from huggingface_hub.hf_api import ModelInfo
+
+API = HfApi()
+
+
+def get_model_size(model_info: ModelInfo, precision: str):
+    """Estimate the size in billions of parameters (0 when unknown), scaled by 8 for GPTQ."""
+    try:
+        billions = round(model_info.safetensors["total"] / 1e9, 3)
+    except (AttributeError, TypeError):
+        try:
+            # Fall back to a size token such as "7b" or "350m" found in the repo name.
+            token = re.compile(r"(\d+\.)?\d+(b|m)").search(model_info.modelId.split("/")[-1].lower()).group(0)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+        number, unit = float(token[:-1]), token[-1]
+        billions = round(number if unit == "b" else number / 1e3, 3)
+
+    is_gptq = precision == "GPTQ" or "gptq" in model_info.modelId.split("/")[-1].lower()
+    return (8 if is_gptq else 1) * billions
+
+
+def update_request_files(requests_path):
+    """Refresh every `<requests_path>/*/*.json` request file with the output of add_model_info.
+
+    Files are visited in reverse-sorted path order and rewritten (indent=4) only when the entry changed.
+    """
+    pattern = os.path.join(requests_path, "*/*.json")
+    for request_path in sorted(glob.glob(pattern), reverse=True):
+        # Load the stored entry and compute its refreshed version.
+        with open(request_path, "r") as handle:
+            current = json.load(handle)
+            refreshed = add_model_info(current)
+
+        if refreshed == current:
+            continue  # nothing changed, leave the file untouched
+        with open(request_path, "w") as handle:
+            handle.write(json.dumps(refreshed, indent=4))
+
+def add_model_info(entry):
+    """Return a copy of `entry` with "params", "likes" and (when available) "license" refreshed from the Hub.
+
+    If the Hub lookup fails, the original `entry` object is returned unchanged.
+    """
+    model = entry["model"]
+    revision = entry["revision"]
+
+    try:
+        model_info = API.model_info(repo_id=model, revision=revision)
+    except Exception:
+        print(f"Could not get model information for {model} revision {revision}")
+        return entry
+
+    new_entry = entry.copy()
+    # Use the request's own precision (GPTQ scales the size); entries without one fall back to float16.
+    new_entry["params"] = get_model_size(model_info=model_info, precision=entry.get("precision", "float16"))
+    new_entry["likes"] = model_info.likes
+
+    # Were the model card and license filled?
+    try:
+        new_entry["license"] = model_info.cardData["license"]
+    except Exception:
+        print(f"No license for {model} revision {revision}")
+
+    print(json.dumps(new_entry, indent=4))
+    return new_entry
+
+
+if __name__ == "__main__":
+    import sys  # local import: only needed for the optional command-line argument
+    # Requests folder: first command-line argument if given, otherwise the previous default location.
+    update_request_files(sys.argv[1] if len(sys.argv) > 1 else "/Volumes/Data-case-sensitive/requests")
\ No newline at end of file
diff --git a/src/__pycache__/envs.cpython-310.pyc b/src/__pycache__/envs.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f25811fd5a115e2825ce938713c0932d7f7219cf
Binary files /dev/null and b/src/__pycache__/envs.cpython-310.pyc differ
diff --git a/src/__pycache__/populate.cpython-310.pyc b/src/__pycache__/populate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b759c2ce988d45325e00cb9bcabc4b03722d7a3
Binary files /dev/null and b/src/__pycache__/populate.cpython-310.pyc differ
diff --git a/src/display/__pycache__/about.cpython-310.pyc b/src/display/__pycache__/about.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b1487ef8761d7c5372b926ff561ee50e2597545
Binary files /dev/null and b/src/display/__pycache__/about.cpython-310.pyc differ
diff --git a/src/display/__pycache__/css_html_js.cpython-310.pyc b/src/display/__pycache__/css_html_js.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1545b916cd62f47006792d16b980b4bdebdbcedd
Binary files /dev/null and b/src/display/__pycache__/css_html_js.cpython-310.pyc differ
diff --git a/src/display/__pycache__/formatting.cpython-310.pyc b/src/display/__pycache__/formatting.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cee5c180af060ff169f5bf7e37a2ff38b587a185
Binary files /dev/null and b/src/display/__pycache__/formatting.cpython-310.pyc differ
diff --git a/src/display/__pycache__/utils.cpython-310.pyc b/src/display/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7eb97b3a373d80b7475d24d488453623ac502d19
Binary files /dev/null and b/src/display/__pycache__/utils.cpython-310.pyc differ
diff --git a/src/display/about.py b/src/display/about.py
new file mode 100755
index 0000000000000000000000000000000000000000..072a9ebc699e8dc73ca6d314791b31b1d2e2598c
--- /dev/null
+++ b/src/display/about.py
@@ -0,0 +1,84 @@
+from src.display.utils import ModelType
+
+
+TITLE = """<img src="https://i.postimg.cc/250G53CJ/src-display-SIL-logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
+
+INTRODUCTION_TEXT = f"""
+Welcome to the Self-Improving Leaderboard (SIL) - A Revolutionary Platform for Evaluating Large Language Models
+The SIL offers a dynamic approach to assessing and ranking open-source LLMs and chatbots. Our innovative system continuously updates test datasets and recalculates rankings daily, ensuring evaluations reflect the rapid evolution of language processing capabilities.
+Key Features:
+• Daily-refreshed test datasets
+• Adaptive ranking system
+• Real-world language processing challenges
+• Comprehensive model performance insights
+Explore our cutting-edge evaluation process, gain deep insights into model capabilities, and see how different LLMs compare in this ever-changing landscape.
+Ready to participate? Submit your model for evaluation on the 'Submit' page and join the forefront of LLM advancement. For a detailed look at our methodology, visit the 'About' page.
+The SIL is proudly developed and maintained by [Your Organization/Team Name]. Together, let's push the boundaries of language AI!
+"""
+
+LLM_BENCHMARKS_TEXT = f"""
+# How it works
+🔄 The Self-Improving Leaderboard (SIL) operates on a dynamic evaluation system that continuously evolves to reflect real-world language processing challenges. Here's an overview of our process:
+Daily Dataset Refresh
+Our system generates new test data daily from diverse, reputable sources.
+Advanced Large Language Models (LLMs) are utilized to synthesize additional relevant content.
+The dataset is divided into two sections:
+A primary dataset maintaining the integrity of sourced data
+A noise-injected dataset simulating real-world data complexities
+Model Evaluation
+Participating models are rigorously evaluated against the refreshed dataset every 24 hours.
+We employ a comprehensive set of metrics aligned with industry-standard benchmarks.
+Our evaluation framework is built on the Eleuther AI Language Model Evaluation Harness, ensuring a robust and consistent assessment.
+Ranking System
+Model rankings are updated daily based on their performance across various tasks.
+The leaderboard reflects not only the latest scores but also tracks consistency and adaptability over time.
+Quarterly Comprehensive Evaluation
+Every three months, we conduct an in-depth analysis of model performance.
+This evaluation considers long-term trends, adaptability to evolving data, and overall efficacy.
+Special recognition (e.g., medals or badges) may be awarded based on sustained excellence.
+By continuously refreshing our test data and evaluation criteria, SIL aims to provide a more accurate representation of model performance in real-world scenarios, driving innovation in the field of Natural Language Processing.
+
+## Icons
+{ModelType.PT.to_str(" : ")} model
+{ModelType.IFT.to_str(" : ")} model
+{ModelType.RL.to_str(" : ")} model
+If there is no icon, it indicates that there is insufficient information about the model.
+Please provide information about the model through an issue! 🤩
+
+## Details and Logs
+- Detailed numerical results in the `results` dataset: https://huggingface.co/datasets/junkim100/SIL_results
+- Community queries and running status in the `requests` dataset: https://huggingface.co/datasets/junkim100/SIL_requests
+"""
+
+EVALUATION_QUEUE_TEXT = f"""
+# Evaluation Queue for the 🔄 Self-Improving Leaderboard
+
+## <Some good practices before submitting a model>
+
+### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+⚠️ Make sure your model is public!
+
+⚠️ Make sure your model runs with [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)
+
+
+### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+
+### 3️⃣ Make sure your model has an open license!
+We'd love for as many people as possible to know they can use your model
+
+### 4️⃣ Fill in your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+"""
\ No newline at end of file
diff --git a/src/display/css_html_js.py b/src/display/css_html_js.py
new file mode 100755
index 0000000000000000000000000000000000000000..b812611116a6e5c963994fc138a38a04a011bdce
--- /dev/null
+++ b/src/display/css_html_js.py
@@ -0,0 +1,84 @@
+custom_css = """
+/* Hides the final AutoEvalColumn */
+#llm-benchmark-tab-table table td:last-child,
+#llm-benchmark-tab-table table th:last-child {
+    display: none;
+}
+
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+
+/* Full width space */
+.gradio-container {
+  max-width: 95%!important;
+}
+
+/* Text style and margins */
+.markdown-text {
+    font-size: 16px !important;
+}
+
+#models-to-add-text {
+    font-size: 18px !important;
+}
+
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+ 
+#search-bar {
+    padding: 0px;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+/* Filters style */
+#filter_type{
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
+}
+#filter_type label {
+    display: flex;
+}
+#filter_type label > span{
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap{
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner{  
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input{
+    width: 1px
+}
+#filter-columns-type{
+    border:0;
+    padding:0.5;
+}
+#filter-columns-size{
+    border:0;
+    padding:0.5;
+}
+#box-filter > .form{
+    border: 0
+}
+"""
+
+get_window_url_params = """
+    function(url_params) {
+        const params = new URLSearchParams(window.location.search);
+        url_params = Object.fromEntries(params);
+        return url_params;
+    }
+    """
diff --git a/src/display/formatting.py b/src/display/formatting.py
new file mode 100755
index 0000000000000000000000000000000000000000..9b0b52ec315cc9725738adec26332b05a9826bc9
--- /dev/null
+++ b/src/display/formatting.py
@@ -0,0 +1,40 @@
+import os
+from datetime import datetime, timezone
+
+from huggingface_hub import HfApi
+from huggingface_hub.hf_api import ModelInfo
+
+
+API = HfApi()
+
+def model_hyperlink(link, model_name):
+    """Return an HTML anchor labelled `model_name` that opens `link` in a new tab."""
+    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
+    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
+
+
+def make_clickable_model(model_name):
+    """Build the HTML cell for a model: a link to its hub page followed by a 📑 link to its details dataset."""
+    hub_url = f"https://huggingface.co/{model_name}"
+
+    # The details dataset id uses "__" where the model id has "/".
+    flattened_name = model_name.replace("/", "__")
+    details_url = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{flattened_name}"
+
+    model_anchor = model_hyperlink(hub_url, model_name)
+    details_anchor = model_hyperlink(details_url, "📑")
+    return model_anchor + "  " + details_anchor
+
+
+def styled_error(error):
+    """Wrap `error` in a centered, red, 20px HTML paragraph."""
+    css = "color: red; font-size: 20px; text-align: center;"
+    return f"<p style='{css}'>{error}</p>"
+
+
+def styled_warning(warn):
+    """Wrap `warn` in a centered, orange, 20px HTML paragraph."""
+    css = "color: orange; font-size: 20px; text-align: center;"
+    return f"<p style='{css}'>{warn}</p>"
+
+
+def styled_message(message):
+    """Wrap `message` in a centered, green, 20px HTML paragraph."""
+    css = "color: green; font-size: 20px; text-align: center;"
+    return f"<p style='{css}'>{message}</p>"
+
+
+def has_no_nan_values(df, columns):
+    """Return a boolean row mask that is True where every one of `columns` is non-NaN."""
+    selected = df[columns]
+    present = selected.notna()
+    return present.all(axis=1)
+
+
+def has_nan_values(df, columns):
+    """Return a boolean row mask that is True where at least one of `columns` is NaN."""
+    selected = df[columns]
+    missing = selected.isna()
+    return missing.any(axis=1)
diff --git a/src/display/utils.py b/src/display/utils.py
new file mode 100755
index 0000000000000000000000000000000000000000..166001f47d46eb20fda526c0ccf2346db7226588
--- /dev/null
+++ b/src/display/utils.py
@@ -0,0 +1,164 @@
+from dataclasses import dataclass, make_dataclass
+from enum import Enum
+
+import pandas as pd
+
+def fields(raw_class):
+    """Return the values in `raw_class.__dict__` whose names neither start nor end with "__"."""
+    members = []
+    for attr_name, attr_value in raw_class.__dict__.items():
+        if attr_name.startswith("__") or attr_name.endswith("__"):
+            continue
+        members.append(attr_value)
+    return members
+
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class Tasks(Enum):
+    arc = Task("arc_challenge", "acc_norm", "ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
+    mmlu = Task("mmlu", "acc", "MMLU")
+    truthfulqa = Task("truthfulqa_mc", "mc2", "TruthfulQA")
+    # winogrande = Task("winogrande", "acc_norm", "Winogrande")
+    # gsm8k = Task("gsm8k", "acc_norm", "GSM8k")
+    commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")
+    # eqBench = Task("eq_bench", "acc_norm", "EQ Bench")
+    # instFollow = Task("inst_follow", "acc_norm", "InstFollow")
+    # harmlessness = Task("harmlessness", "acc_norm", "Harmlessness")
+    # helpfulness = Task("helpfulness", "acc_norm", "Helpfulness")
+
+class Ranks(Enum):
+    daily = Task("daily", "daily", "Daily Rank")
+    quarterly = Task("quarterly", "quarterly", "Quarterly Rank")
+
+
+# These classes are for user facing column names,
+# to avoid having to change them all around the code
+# when a modif is needed
+@dataclass
+class ColumnContent:
+    # Describes one user-facing table column.
+    name: str  # header text shown to the user
+    type: str  # type label; this file uses "str", "number", "markdown" and "bool"
+    displayed_by_default: bool  # used to build COLS_LITE / TYPES_LITE
+    hidden: bool = False  # hidden columns are left out of COLS / TYPES
+    never_hidden: bool = False
+    dummy: bool = False  # set on the "model_name_for_query" search column
+
+auto_eval_column_dict = []
+# Init
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# Ranks
+auto_eval_column_dict.append(["daily", ColumnContent, ColumnContent("Daily Rank", "number", True)])
+auto_eval_column_dict.append(["quarterly", ColumnContent, ColumnContent("Quarterly Rank", "number", True)])
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
+
+# Column descriptors for the evaluation-queue tables. The attributes are plain
+# (un-annotated) class attributes; `fields(EvalQueueColumn)` collects them to
+# build EVAL_COLS / EVAL_TYPES.
+@dataclass(frozen=True)
+class EvalQueueColumn:  # Queue column
+    model = ColumnContent("model", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    precision = ColumnContent("precision", "str", True)
+    # `displayed_by_default` is a bool; the string "Original" used to be passed here.
+    weight_type = ColumnContent("weight_type", "str", True)
+    status = ColumnContent("status", "str", True)
+
+# Define the human baselines
+human_baseline_row = {
+    AutoEvalColumn.model.name: "<p>Human performance</p>",
+}
+
+@dataclass
+class ModelDetails:
+    name: str
+    symbol: str = "" # emoji, only for the model type
+
+
+class ModelType(Enum):
+    """Training category of a model; each member wraps a ModelDetails (name + emoji symbol)."""
+
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    # FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
+
+    def to_str(self, separator=" "):
+        """Render the member as its symbol, then `separator`, then its name."""
+        details = self.value
+        return f"{details.symbol}{separator}{details.name}"
+
+    @staticmethod
+    def from_str(type):
+        """Map a free-form label to a member by substring match on the name or the symbol."""
+        # Candidates are tried in this order and the first hit wins; the
+        # disabled fine-tuned type (🔶) is intentionally not matched.
+        for candidate in (ModelType.PT, ModelType.RL, ModelType.IFT):
+            details = candidate.value
+            if details.name in type or details.symbol in type:
+                return candidate
+        return ModelType.Unknown
+
+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")
+
+class Precision(Enum):
+    """Numeric precision of an evaluated model.
+
+    Only `float16` is enabled; the other variants are kept commented out below.
+    """
+
+    float16 = ModelDetails("float16")
+    # bfloat16 = ModelDetails("bfloat16")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
+    Unknown = ModelDetails("?")
+
+    @staticmethod
+    def from_str(precision):
+        """Map a dtype string to a Precision member.
+
+        Args:
+            precision: dtype label such as "torch.float16" or "float16"; may be None.
+
+        Returns:
+            `Precision.float16` for the float16 spellings, `Precision.Unknown`
+            for everything else.
+        """
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        # "8bit", "4bit" and "GPTQ" used to return members that are commented
+        # out above, which raised AttributeError. Until those members are
+        # re-enabled they are reported as Unknown like any other label.
+        return Precision.Unknown
+        
+
+
+
+# Column selection
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+
+NUMERIC_INTERVALS = {
+    "Unknown": pd.Interval(-1, 0, closed="right"),
+    "0~3B": pd.Interval(0, 3, closed="right"),
+    "3~7B": pd.Interval(3, 7.3, closed="right"),
+    "7~13B": pd.Interval(7.3, 13, closed="right"),
+    "13~35B": pd.Interval(13, 35, closed="right"),
+    "35~60B": pd.Interval(35, 60, closed="right"),
+    "60B+": pd.Interval(60, 10000, closed="right"),
+}
diff --git a/src/envs.py b/src/envs.py
new file mode 100755
index 0000000000000000000000000000000000000000..a3f4f73977ab2e88b4c99c4aaeebb73aa9f210f4
--- /dev/null
+++ b/src/envs.py
@@ -0,0 +1,32 @@
+import os
+
+from huggingface_hub import HfApi
+
+# clone / pull the lmeh eval data
+H4_TOKEN = os.environ.get("H4_TOKEN", None)
+
+REPO_ID = "junkim100/self-improving-leaderboard"
+QUEUE_REPO = "junkim100/SIL_requests"
+RESULTS_REPO = "junkim100/SIL_results"
+
+PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/private-requests"
+PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/private-results"
+
+# Parse the flag textually: bool("False") is True, so a plain bool() cast could never turn it off.
+IS_PUBLIC = str(os.environ.get("IS_PUBLIC", True)).strip().lower() not in ("", "0", "false", "no", "off")
+
+CACHE_PATH=os.getenv("HF_HOME", ".")
+
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+
+EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
+EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
+
+PATH_TO_COLLECTION = "open-ko-llm-leaderboard/ko-llm-leaderboard-best-models-659c7e45a481ceea4c883506"
+
+# Rate limit variables
+RATE_LIMIT_PERIOD = 7
+RATE_LIMIT_QUOTA = 5
+HAS_HIGHER_RATE_LIMIT = []
+
+API = HfApi(token=H4_TOKEN)
diff --git a/src/leaderboard/__pycache__/filter_models.cpython-310.pyc b/src/leaderboard/__pycache__/filter_models.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7500ee03c42db8f02506351ae84b64e5e93508b0
Binary files /dev/null and b/src/leaderboard/__pycache__/filter_models.cpython-310.pyc differ
diff --git a/src/leaderboard/__pycache__/read_evals.cpython-310.pyc b/src/leaderboard/__pycache__/read_evals.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca2d5dba4b960f6b5a8718f33138bc6f99aa980a
Binary files /dev/null and b/src/leaderboard/__pycache__/read_evals.cpython-310.pyc differ
diff --git a/src/leaderboard/filter_models.py b/src/leaderboard/filter_models.py
new file mode 100755
index 0000000000000000000000000000000000000000..95ea9e5214ef507a41e512052ec4a27dae7dc9a4
--- /dev/null
+++ b/src/leaderboard/filter_models.py
@@ -0,0 +1,51 @@
+from src.display.formatting import model_hyperlink
+from src.display.utils import AutoEvalColumn
+
+# Models which have been flagged by users as being problematic for a reason or another
+# (Model name to forum discussion link)
+FLAGGED_MODELS = {
+    "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "TeamUNIVA/Komodo_7B_v0.1.0": "https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/44",
+}
+
+# Models which have been requested by orgs to not be submitted on the leaderboard
+DO_NOT_SUBMIT_MODELS = [
+]
+
+
+def flag_models(leaderboard_data: list[dict]):
+    """Set the flagged column of every row in place and annotate the model cell of flagged rows.
+
+    A row is flagged when its lookup key is present in FLAGGED_MODELS; the
+    model cell then gets a link to the corresponding discussion appended.
+    """
+    flagged_col = AutoEvalColumn.flagged.name
+    model_col = AutoEvalColumn.model.name
+
+    for row in leaderboard_data:
+        # Rows that arrive already flagged share the generic "merged" entry;
+        # all others are looked up by their model name.
+        lookup_key = "merged" if row[flagged_col] == True else row["model_name_for_query"]
+
+        if lookup_key not in FLAGGED_MODELS:
+            row[flagged_col] = False
+            continue
+
+        discussion_url = FLAGGED_MODELS[lookup_key]
+        issue_num = discussion_url.split("/")[-1]
+        issue_link = model_hyperlink(discussion_url, f"See discussion #{issue_num}")
+        row[model_col] = f"{row[model_col]} has been flagged! {issue_link}"
+        row[flagged_col] = True
+
+
+def remove_forbidden_models(leaderboard_data: list[dict]):
+    """Drop, in place, every row whose model is listed in DO_NOT_SUBMIT_MODELS and return the same list."""
+    # Slice assignment keeps the caller's list object, as the original pop() loop did.
+    leaderboard_data[:] = [
+        row for row in leaderboard_data if row["model_name_for_query"] not in DO_NOT_SUBMIT_MODELS
+    ]
+    return leaderboard_data
+
+
+def filter_models(leaderboard_data: list[dict]):
+    """Filter `leaderboard_data` in place: remove forbidden models, then flag the remaining rows."""
+    remaining_rows = remove_forbidden_models(leaderboard_data)
+    flag_models(remaining_rows)
diff --git a/src/leaderboard/read_evals.py b/src/leaderboard/read_evals.py
new file mode 100755
index 0000000000000000000000000000000000000000..bdba3a3d90c40cfc647d55becadd6ebc568c3aad
--- /dev/null
+++ b/src/leaderboard/read_evals.py
@@ -0,0 +1,272 @@
+import glob
+import json
+import math
+import os
+from dataclasses import dataclass
+
+import dateutil
+import numpy as np
+
+from huggingface_hub import ModelCard
+
+from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Ranks, Precision, WeightType
+from src.submission.check_validity import is_model_on_hub, check_model_card
+
+
+@dataclass
+class EvalResult:
+    # Also see src.display.utils.AutoEvalColumn for what will be displayed.
+    eval_name: str # org_model_precision (uid)
+    full_model: str # org/model (path on hub)
+    org: str 
+    model: str
+    revision: str # commit hash, "" if main
+    results: dict
+    precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original # Original or Adapter
+    architecture: str = "Unknown" # From config file
+    license: str = "?"
+    likes: int = 0
+    num_params: int = 0
+    date: str = "" # submission date of request file
+    still_on_hub: bool = False
+    is_merge: bool = False
+    flagged: bool = False
+
+    @classmethod
+    def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file
+
+        Reads the JSON file, derives the model identity and precision from its
+        config section, checks the model on the hub, and collects the rank and
+        task scores found under the "results" key.
+
+        Note: this is a classmethod, so the first parameter (named `self`) is
+        the class itself.
+        """
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        # We manage the legacy config format
+        # NOTE(review): if neither key is present `config` is None and the
+        # `.get` calls below raise AttributeError.
+        config = data.get("config", data.get("config_general", None))
+
+        # Precision
+        precision = Precision.from_str(config.get("model_dtype"))
+
+        # Get model and org
+        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = org_and_model.split("/", 1)
+
+        # A name without "/" has no org; the result key is the uid used to
+        # group several result files of the same model and precision.
+        if len(org_and_model) == 1:
+            org = None
+            model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
+        else:
+            org = org_and_model[0]
+            model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
+        full_model = "/".join(org_and_model)
+
+        still_on_hub, error, model_config = is_model_on_hub(
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        )
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
+
+        # If the model doesn't have a model card or a license, we consider it's deleted
+        if still_on_hub:
+            try:
+                if check_model_card(full_model)[0] is False:
+                    still_on_hub = False
+            except Exception:
+                still_on_hub = False
+
+        # Check if the model is a merge
+        is_merge_from_metadata = False
+        flagged = False
+        if still_on_hub:
+            model_card = ModelCard.load(full_model)
+
+            if model_card.data.tags:
+                is_merge_from_metadata = "merge" in model_card.data.tags
+            # NOTE(review): the card text is lower-cased before matching, so the
+            # capitalised keyword "Carbon" can never match as written.
+            merge_keywords = ["mergekit", "merged model", "merge model", "merging", "Carbon"]
+            # If the model is a merge but not saying it in the metadata, we flag it
+            is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
+            flagged = is_merge_from_model_card and not is_merge_from_metadata
+
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        # Rank values are copied as-is (no averaging, no scaling).
+        for rank in Ranks:
+            rank = rank.value
+            if rank.benchmark in data["results"]:
+                results[rank.benchmark] = data["results"][rank.benchmark][rank.metric]
+        for task in Tasks:
+            task = task.value
+
+            # Some truthfulQA values are NaNs
+            # NOTE(review): the Tasks enum names this benchmark "truthfulqa_mc",
+            # so this "truthfulqa:mc" comparison looks unreachable -- confirm.
+            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
+                    results[task.benchmark] = 0.0
+                    continue
+
+            # New tasks have been added, we need to skip them if not exists
+            # (none of the benchmarks listed here is enabled in Tasks at the moment)
+            if task.benchmark in ["winogrande", "gsm8k", "eq_bench", "inst_follow", "harmlessness", "helpfulness"]:
+                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    results[task.benchmark] = 0.0
+                    continue
+
+            # We average all scores of a given metric (mostly for mmlu)
+            # Every "results" entry whose key contains the benchmark name takes part;
+            # the task is left out of `results` if none exists or any lacks the metric.
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            # Stored as a percentage.
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
+        return self(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            precision=precision,  
+            revision= config.get("model_sha", ""),
+            still_on_hub=still_on_hub,
+            architecture=architecture,
+            is_merge=is_merge_from_metadata,
+            flagged=flagged,
+        )
+
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it
+
+        Fills model_type, weight_type, license, likes, num_params and date from
+        the request JSON. Any exception raised while opening or reading the
+        file is swallowed and reported with a printed message; attributes
+        assigned before the failure keep their new values.
+        """
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+        try:
+            with open(request_file, "r") as f:
+                request = json.load(f)
+            self.model_type = ModelType.from_str(request.get("model_type", ""))
+            # Lookup by member name; an unknown weight type raises KeyError, which
+            # the handler below reports with the same "could not find" message.
+            self.weight_type = WeightType[request.get("weight_type", "Original")]
+            self.license = request.get("license", "?")
+            self.likes = request.get("likes", 0)
+            self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
+        except Exception:
+            print(f"Could not find request file for {self.org}/{self.model}")
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display
+
+        Returns:
+            A dict keyed by the user-facing column names, plus "eval_name".
+
+        Raises:
+            KeyError: if a task or rank entry is missing from `self.results`
+                (callers use this to detect incomplete results).
+        """
+        # Average the benchmark scores only. `self.results` also stores the
+        # "daily"/"quarterly" rank entries, which must not be summed into the mean.
+        task_scores = [self.results.get(task.value.benchmark) for task in Tasks]
+        average = sum(score for score in task_scores if score is not None) / len(Tasks)
+
+        data_dict = {
+            "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.precision.name: self.precision.value.name,
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            AutoEvalColumn.merged.name: self.is_merge,
+            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, # + "🥦" if self.is_merge,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.dummy.name: self.full_model,
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.flagged.name: self.flagged
+        }
+
+        # One column per task score and per rank; a missing entry raises KeyError.
+        score_columns = [task.value for task in Tasks] + [rank.value for rank in Ranks]
+        for column in score_columns:
+            data_dict[column.col_name] = self.results[column.benchmark]
+
+        return data_dict
+
+
+def get_request_file_for_model(requests_path, model_name, precision):
+    """Return the path of the FINISHED request file matching `model_name` and `precision`.
+
+    Candidates are `<requests_path>/<model_name>_eval_request_*.json`, visited in
+    reverse-sorted order. The last match visited is kept, and "" is returned
+    when nothing matches.
+    """
+    pattern = os.path.join(requests_path, f"{model_name}_eval_request_*.json")
+    candidates = sorted(glob.glob(pattern), reverse=True)
+
+    chosen = ""
+    for candidate in candidates:
+        with open(candidate, "r") as handle:
+            content = json.load(handle)
+        is_finished = content["status"] in ["FINISHED"]
+        # Only the last dotted component of `precision` is compared (e.g. "torch.float16" -> "float16").
+        if is_finished and content["precision"] == precision.split(".")[-1]:
+            chosen = candidate
+    return chosen
+
+
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results
+
+    Walks `results_path`, builds one EvalResult per JSON file, merges results
+    that share the same `eval_name`, and returns only those for which
+    `to_dict()` does not raise KeyError.
+    """
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        # (a directory holding any non-.json file is skipped entirely)
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date
+        # The key is the file name without ".json" and "results_", minus its last
+        # 7 characters -- presumably a timestamp's fractional seconds; confirm.
+        # NOTE(review): the key does no date parsing, so the ParserError handler
+        # below does not look reachable from this sort.
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result.update_with_request_file(requests_path)
+
+        # Store results of same eval together
+        # Later files overwrite earlier scores for the same key; None values are ignored.
+        eval_name = eval_result.eval_name
+        if eval_name in eval_results.keys():
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict() # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
diff --git a/src/populate.py b/src/populate.py
new file mode 100755
index 0000000000000000000000000000000000000000..df32a6311e2305ecb14eef8cf2feda655a261bab
--- /dev/null
+++ b/src/populate.py
@@ -0,0 +1,70 @@
+import json
+import os
+
+import pandas as pd
+
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.leaderboard.filter_models import filter_models
+from src.leaderboard.read_evals import get_raw_eval_results
+
+
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[list, pd.DataFrame]:
+    """Build the leaderboard table from the result and request files.
+
+    Args:
+        results_path: root folder of the evaluation result files.
+        requests_path: root folder of the evaluation request files.
+        cols: columns to keep in the returned dataframe.
+        benchmark_cols: columns that must all be non-NaN for a row to be kept.
+
+    Returns:
+        A `(raw_data, df)` pair: the raw results as returned by
+        `get_raw_eval_results`, and the dataframe sorted by "Daily Rank"
+        (ascending), restricted to `cols` and rounded to 2 decimals.
+    """
+    raw_data = get_raw_eval_results(results_path, requests_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    filter_models(all_data_json)
+
+    df = pd.DataFrame.from_records(all_data_json)
+    # With no results the frame has no columns, and sorting by a missing
+    # column would raise KeyError.
+    if "Daily Rank" in df.columns:
+        df = df.sort_values(by=["Daily Rank"], ascending=True)
+
+    # Both steps stay best-effort, but failures are now reported instead of
+    # silently swallowed (and KeyboardInterrupt/SystemExit are no longer caught).
+    try:
+        df = df[cols].round(decimals=2)
+    except Exception as err:
+        print(f"Could not select/round the leaderboard columns: {err!r}")
+
+    # filter out if any of the benchmarks have not been produced
+    try:
+        df = df[has_no_nan_values(df, benchmark_cols)]
+    except Exception as err:
+        print(f"Could not filter rows with missing benchmark values: {err!r}")
+    return raw_data, df
+
+
+def _load_eval_request(file_path: str) -> dict:
+    """Read one request JSON file and fill in its display columns (model link, revision)."""
+    with open(file_path) as fp:
+        data = json.load(fp)
+
+    data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+    return data
+
+
+def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """Load the evaluation requests under `save_path` and split them by status.
+
+    Request files are the `.json` files found directly in `save_path` or one
+    folder level below it. Entries whose name starts with "." are ignored, and
+    so is anything that is not a `.json` file.
+
+    Args:
+        save_path: root folder of the request files.
+        cols: columns of the returned dataframes.
+
+    Returns:
+        The `(finished, running, pending, failed)` dataframes, in that order.
+    """
+    all_evals = []
+
+    for entry in os.listdir(save_path):
+        if entry.startswith("."):
+            continue
+        entry_path = os.path.join(save_path, entry)
+
+        # Folders are detected on disk rather than guessed from the name, so a
+        # stray non-JSON file no longer crashes listdir()/json.load().
+        if os.path.isdir(entry_path):
+            for sub_entry in os.listdir(entry_path):
+                if sub_entry.startswith(".") or not sub_entry.endswith(".json"):
+                    continue
+                all_evals.append(_load_eval_request(os.path.join(entry_path, sub_entry)))
+        elif entry.endswith(".json"):
+            all_evals.append(_load_eval_request(entry_path))
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+    failed_list = [e for e in all_evals if e["status"] == "FAILED"]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+    df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
+    return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
diff --git a/src/submission/__pycache__/check_validity.cpython-310.pyc b/src/submission/__pycache__/check_validity.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..414d55f4b552402359607afd13cb0ca8696bcc75
Binary files /dev/null and b/src/submission/__pycache__/check_validity.cpython-310.pyc differ
diff --git a/src/submission/__pycache__/submit.cpython-310.pyc b/src/submission/__pycache__/submit.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c78822770039cb8f07a97c179a7fb6b2c258b748
Binary files /dev/null and b/src/submission/__pycache__/submit.cpython-310.pyc differ
diff --git a/src/submission/check_validity.py b/src/submission/check_validity.py
new file mode 100755
index 0000000000000000000000000000000000000000..aba20a605b10aa4418703cad0345a3c0091fe1b3
--- /dev/null
+++ b/src/submission/check_validity.py
@@ -0,0 +1,129 @@
+import json
+import os
+import re
+from collections import defaultdict
+from datetime import datetime, timedelta, timezone
+
+import huggingface_hub
+from huggingface_hub import ModelCard
+from huggingface_hub.hf_api import ModelInfo
+from transformers import AutoConfig, AutoTokenizer
+
+from src.envs import HAS_HIGHER_RATE_LIMIT
+
+
+# ht to @Wauplin, thank you for the snippet!
+# See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317
+def check_model_card(repo_id: str) -> tuple[bool, str]:
+    """Validate that a Hub repository ships a usable model card.
+
+    The card is loaded with ``ModelCard.load`` and has to pass three checks:
+    it exists, it declares a license (either the ``license`` metadata field or
+    a ``license_name``/``license_link`` pair), and its text is at least 200
+    characters long.
+
+    :param repo_id: Hub identifier of the repository whose card is checked.
+    :return: ``(ok, error_message)``; the message is empty when ``ok`` is True.
+    """
+    try:
+        model_card = ModelCard.load(repo_id)
+    except huggingface_hub.utils.EntryNotFoundError:
+        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
+
+    # A card without `license` is still accepted when it names a custom license.
+    metadata = model_card.data
+    if metadata.license is None and ("license_name" not in metadata or "license_link" not in metadata):
+        return False, (
+            "License not found. Please add a license to your model card using the `license` metadata or a"
+            " `license_name`/`license_link` pair."
+        )
+
+    # Cards shorter than 200 characters are rejected as undocumented.
+    description_is_too_short = len(model_card.text) < 200
+    if description_is_too_short:
+        return False, "Please add a description to your model card, it is too short."
+
+    return True, ""
+
+
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str | None, object]:
+    """Check that a model's config (and optionally its tokenizer) can be loaded.
+
+    :param model_name: Hub identifier of the model.
+    :param revision: Revision passed to the ``from_pretrained`` calls.
+    :param token: Optional access token passed to the ``from_pretrained`` calls.
+    :param trust_remote_code: Forwarded to the ``from_pretrained`` calls.
+    :param test_tokenizer: When True, the tokenizer is loaded as well.
+    :return: ``(ok, error_fragment, config)``. On success the fragment is None
+        and ``config`` is the loaded configuration. On failure ``config`` is
+        None and the fragment is a sentence tail meant to be appended after
+        the model name by the caller.
+    """
+    try:
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )  # , force_download=True)
+        if test_tokenizer:
+            try:
+                # Only loadability matters here; the tokenizer object itself is discarded.
+                AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
+            except ValueError as e:
+                return (
+                    False,
+                    f"uses a tokenizer which is not in a transformers release: {e}",
+                    None
+                )
+            except Exception:
+                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
+        return True, None, config
+
+    except ValueError:
+        # A ValueError while loading the config is reported as a remote-code requirement.
+        return (
+            False,
+            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
+            None
+        )
+
+    except Exception:
+        # Any other failure is reported as the model being absent from the Hub.
+        return False, "was not found on hub!", None
+
+
+def get_model_size(model_info: ModelInfo, precision: str):
+    """Return the model size, scaled by 8 for GPTQ models.
+
+    The parameter count is read from the safetensors metadata when available
+    (both a mapping with a ``"total"`` key and an object with a ``total``
+    attribute are supported) and divided by 1e9. Otherwise the size is parsed
+    from the repository name, e.g. ``7b`` -> 7.0 and ``350m`` -> 0.35.
+
+    :param model_info: Hub model info exposing ``safetensors`` and ``modelId``.
+    :param precision: Submitted precision label; ``"GPTQ"`` triggers the x8 factor.
+    :return: The size rounded to 3 decimals (times 8 for GPTQ), or 0 when it
+        cannot be determined.
+    """
+    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+    try:
+        safetensors = model_info.safetensors
+        total = safetensors["total"] if isinstance(safetensors, dict) else safetensors.total
+        model_size = round(total / 1e9, 3)
+    except (AttributeError, KeyError, TypeError):
+        # No usable safetensors metadata: fall back to the size hint in the repo name.
+        try:
+            size_match = re.search(size_pattern, model_info.modelId.split("/")[-1].lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    # NOTE(review): the x8 factor presumably compensates for packed low-bit GPTQ weights -- confirm.
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.split("/")[-1].lower()) else 1
+    model_size = size_factor * model_size
+    return model_size
+
+def get_model_arch(model_info: ModelInfo):
+    """Return the ``architectures`` entry of the model's config.
+
+    :param model_info: Hub model info whose ``config`` is a mapping or None.
+    :return: The value stored under ``"architectures"``, or ``"Unknown"`` when
+        the key or the whole config is missing.
+    """
+    config = getattr(model_info, "config", None)
+    if config is None:
+        # Without a config there is nothing to look up.
+        return "Unknown"
+    return config.get("architectures", "Unknown")
+
+def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
+    """Decide whether an organisation or user may submit another model.
+
+    :param org_or_user: Name of the submitting organisation or user.
+    :param users_to_submission_dates: Mapping from name to a list of
+        submission timestamp strings.
+    :param rate_limit_period: Length of the look-back window, in days.
+    :param rate_limit_quota: Number of submissions tolerated inside the
+        window; doubled for names listed in ``HAS_HIGHER_RATE_LIMIT``.
+    :return: ``(allowed, error_message)``; the message is empty when allowed.
+    """
+    if org_or_user not in users_to_submission_dates:
+        return True, ""
+
+    # Timestamps are compared as strings; this assumes the stored values use the
+    # same fixed-width "%Y-%m-%dT%H:%M:%SZ" format as the cutoff built here.
+    time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
+    num_models_submitted_in_period = sum(
+        1 for submitted in users_to_submission_dates[org_or_user] if submitted > time_limit
+    )
+
+    if org_or_user in HAS_HIGHER_RATE_LIMIT:
+        rate_limit_quota = 2 * rate_limit_quota
+
+    # NOTE(review): the strict `>` lets a submitter reach quota + 1 requests -- confirm this is intended.
+    if num_models_submitted_in_period > rate_limit_quota:
+        error_msg = (
+            f"Organisation or user `{org_or_user}` "
+            f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
+            f"in the last {rate_limit_period} days.\n"
+            "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
+        )
+        return False, error_msg
+    return True, ""
+
+
+def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
+    """Scan the local request queue for already-submitted models.
+
+    Only ``.json`` files sitting exactly one directory below
+    ``requested_models_dir`` are read.
+
+    :param requested_models_dir: Root directory of the downloaded request queue.
+    :return: ``(identifiers, users_to_submission_dates)`` where ``identifiers``
+        is the set of ``"{model}_{revision}_{precision}"`` strings and
+        ``users_to_submission_dates`` maps the part of the model id before the
+        first ``/`` to the list of ``submitted_time`` values.
+    """
+    depth = 1
+    file_names = []
+    users_to_submission_dates = defaultdict(list)
+
+    # Normalise first: a trailing separator would otherwise skew the depth arithmetic.
+    requested_models_dir = os.path.normpath(requested_models_dir)
+    base_depth = requested_models_dir.count(os.sep)
+
+    for root, _, files in os.walk(requested_models_dir):
+        if root.count(os.sep) - base_depth != depth:
+            continue
+        for file in files:
+            if not file.endswith(".json"):
+                continue
+            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
+                info = json.load(f)
+
+            # A request without an explicit revision is treated as targeting "main".
+            file_names.append(f"{info['model']}_{info.get('revision', 'main')}_{info['precision']}")
+
+            # Select organisation
+            if info["model"].count("/") == 0 or "submitted_time" not in info:
+                continue
+            organisation = info["model"].split("/", 1)[0]
+            users_to_submission_dates[organisation].append(info["submitted_time"])
+
+    return set(file_names), users_to_submission_dates
diff --git a/src/submission/submit.py b/src/submission/submit.py
new file mode 100755
index 0000000000000000000000000000000000000000..5ce81c0df846365e4580fe7f59261cfe8002d9ea
--- /dev/null
+++ b/src/submission/submit.py
@@ -0,0 +1,143 @@
+import json
+import os
+from datetime import datetime, timezone
+
+from src.display.formatting import styled_error, styled_message, styled_warning
+from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
+from src.submission.check_validity import (
+    already_submitted_models,
+    check_model_card,
+    get_model_size,
+    is_model_on_hub,
+    user_submission_permission,
+)
+
+# Module-level caches of the request queue. Both start as None and are filled
+# by `add_new_eval` from `already_submitted_models(EVAL_REQUESTS_PATH)`.
+REQUESTED_MODELS = None
+USERS_TO_SUBMISSION_DATES = None
+
+def add_new_eval(
+    model: str,
+    base_model: str,
+    revision: str,
+    precision: str,
+    private: bool,
+    weight_type: str,
+    model_type: str,
+):
+    """Validate a submission and push an evaluation request to the queue repo.
+
+    The checks run in this order: model type selected, submitter not
+    "upstage", rate limit, do-not-submit list, model (and base model for
+    Delta/Adapter weights) loadable, model info retrievable, size below 30B,
+    license present, model card valid, and not already requested. On success
+    a JSON request with status ``PENDING`` is uploaded to ``QUEUE_REPO``.
+
+    :param model: Hub id of the submitted model (``user/name`` or ``name``).
+    :param base_model: Hub id of the base model, used for Delta/Adapter weights.
+    :param revision: Revision to evaluate; an empty string means ``"main"``.
+    :param precision: Precision label; only the part before the first space is kept.
+    :param private: Stored in the request and in the request file name.
+    :param weight_type: ``"Delta"``, ``"Adapter"`` or another weight type label.
+    :param model_type: Model type label; must be non-empty.
+    :return: A styled message, warning or error produced by the
+        ``styled_*`` helpers.
+    """
+    global REQUESTED_MODELS
+    global USERS_TO_SUBMISSION_DATES
+    if REQUESTED_MODELS is None:
+        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
+
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        name_parts = model.split("/")
+        user_name = name_parts[0]
+        model_path = name_parts[1]
+
+    precision = precision.split(" ")[0]
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    if not model_type:
+        return styled_error("Please select a model type.")
+
+    # Upstage models are not allowed to be submitted, to ensure the transparency and fairness of the leaderboard.
+    if user_name == "upstage":
+        return styled_warning("We do not conduct evaluations on Upstage models to ensure the transparency and fairness of the leaderboard. Please take this into consideration.")
+
+    # Is the user rate limited?
+    if user_name != "":
+        user_can_submit, error_msg = user_submission_permission(
+            user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+        )
+        if not user_can_submit:
+            return styled_error(error_msg)
+
+    # Did the model authors forbid its submission to the leaderboard?
+    if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
+        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
+
+    # An empty revision means the default branch.
+    if revision == "":
+        revision = "main"
+
+    # Is the model on the hub?
+    if weight_type in ["Delta", "Adapter"]:
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
+        if not base_model_on_hub:
+            return styled_error(f'Base model "{base_model}" {error}')
+
+    if weight_type != "Adapter":
+        model_on_hub, error, _ = is_model_on_hub(
+            model_name=model,
+            revision=revision,
+            test_tokenizer=True,
+            # Remote code is only trusted for this single explicitly listed model.
+            trust_remote_code=model == "KT-AI/midm-bitext-S-7B-inst-v2",
+        )
+        if not model_on_hub:
+            return styled_error(f'Model "{model}" {error}')
+
+    # Is the model info correctly filled?
+    try:
+        model_info = API.model_info(repo_id=model, revision=revision)
+    except Exception:
+        return styled_error("Could not get your model information. Please fill it up properly.")
+
+    model_size = get_model_size(model_info=model_info, precision=precision)
+
+    # Is the model smaller than 30B?
+    if float(model_size) >= 30:
+        return styled_error("Please submit a model that is less than 30B")
+
+    # Were the model card and license filled?
+    try:
+        model_license = model_info.cardData["license"]
+    except Exception:
+        return styled_error("Please select a license for your model")
+
+    modelcard_OK, error_msg = check_model_card(model)
+    if not modelcard_OK:
+        return styled_error(error_msg)
+
+    # Seems good, creating the eval
+    print("Adding new eval")
+
+    eval_entry = {
+        "model": model,
+        "base_model": base_model,
+        "revision": revision,
+        "private": private,
+        "precision": precision,
+        "weight_type": weight_type,
+        "status": "PENDING",
+        "submitted_time": current_time,
+        "model_type": model_type,
+        "likes": model_info.likes,
+        "params": model_size,
+        "license": model_license,
+    }
+
+    # Check for duplicate submission
+    request_key = f"{model}_{revision}_{precision}"
+    if request_key in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted.")
+
+    print("Creating eval file")
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    file_name = f"{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
+    out_path = f"{OUT_DIR}/{file_name}"
+    # The path inside the queue repo mirrors the layout below EVAL_REQUESTS_PATH.
+    path_in_repo = f"{user_name}/{file_name}" if user_name else file_name
+
+    with open(out_path, "w") as f:
+        f.write(json.dumps(eval_entry))
+
+    print("Uploading eval file")
+    try:
+        API.upload_file(
+            path_or_fileobj=out_path,
+            path_in_repo=path_in_repo,
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            commit_message="Update status to PENDING.",
+        )
+    finally:
+        # Remove the local file, even when the upload fails
+        os.remove(out_path)
+
+    # The local file is gone, so record the request in the in-memory caches;
+    # otherwise duplicate and rate-limit checks would miss it until a reload.
+    REQUESTED_MODELS.add(request_key)
+    if user_name != "":
+        USERS_TO_SUBMISSION_DATES.setdefault(user_name, []).append(current_time)
+
+    return styled_message(
+        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
+    )
+
diff --git a/src/tools/__pycache__/collections.cpython-310.pyc b/src/tools/__pycache__/collections.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f21bd55aed9baed92a9ea8ff889d99f9cbf051
Binary files /dev/null and b/src/tools/__pycache__/collections.cpython-310.pyc differ
diff --git a/src/tools/__pycache__/plots.cpython-310.pyc b/src/tools/__pycache__/plots.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e3b8ae0e3feef7362ecf64dadaac3e5f923c5e44
Binary files /dev/null and b/src/tools/__pycache__/plots.cpython-310.pyc differ
diff --git a/src/tools/collections.py b/src/tools/collections.py
new file mode 100755
index 0000000000000000000000000000000000000000..cca2e0afa2ca9d5cd2f6820f1aed841c78f6bb9a
--- /dev/null
+++ b/src/tools/collections.py
@@ -0,0 +1,82 @@
+import os
+
+import pandas as pd
+from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
+from huggingface_hub.utils._errors import HfHubHTTPError
+from pandas import DataFrame
+
+from src.display.utils import AutoEvalColumn, ModelType
+from src.envs import H4_TOKEN, PATH_TO_COLLECTION
+
+# Specific intervals for the collections
+# Keys are the size labels used in collection notes; values are the ranges the
+# leaderboard's params column is matched against. Every range excludes its lower
+# bound and includes its upper bound (closed="right").
+# Note that the "3~7B" / "7~13B" boundary sits at 7.3 rather than 7.
+# NOTE(review): params are presumably expressed in billions -- confirm.
+intervals = {
+    "0~3B": pd.Interval(0, 3, closed="right"),
+    "3~7B": pd.Interval(3, 7.3, closed="right"),
+    "7~13B": pd.Interval(7.3, 13, closed="right"),
+    "13~35B": pd.Interval(13, 35, closed="right"),
+    "35~60B": pd.Interval(35, 60, closed="right"),
+    "60B+": pd.Interval(60, 10000, closed="right"),
+}
+
+def update_collections(df: DataFrame):
+    """This function updates the Open Ko LLM Leaderboard model collection with the latest best models for
+    each size category and type.
+
+    For every named model type and every size interval, candidate models are
+    ranked by average score and the first one that can be added to the
+    collection is kept. Collection items that are not among the kept models
+    are deleted afterwards.
+
+    :param df: Leaderboard DataFrame providing the params, model-type-symbol,
+        average and dummy (model name) columns named by ``AutoEvalColumn``.
+    """
+    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
+    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+
+    cur_best_models = []
+
+    ix = 0
+    for model_type in ModelType:
+        if model_type.value.name == "":
+            continue
+
+        # The type filter does not depend on the size, so it is computed once per type.
+        type_emoji = [t[0] for t in model_type.value.symbol]
+        type_df = df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+
+        for size in intervals:
+            # We filter the df to gather the relevant models
+            numeric_interval = pd.IntervalIndex([intervals[size]])
+            mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+            filtered_df = type_df.loc[mask]
+
+            best_models = list(
+                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.dummy.name]
+            )
+            print(model_type.value.symbol, size, best_models[:10])
+
+            # We add them one by one to the leaderboard, stopping at the first success
+            for model in best_models:
+                ix += 1
+                cur_len_collection = len(collection.items)
+                try:
+                    collection = add_collection_item(
+                        PATH_TO_COLLECTION,
+                        item_id=model,
+                        item_type="model",
+                        exists_ok=True,
+                        note=f"Best {model_type.to_str(' ')} model of size {size} on the leaderboard today!",
+                        token=H4_TOKEN,
+                    )
+                    if (
+                        len(collection.items) > cur_len_collection
+                    ):  # we added an item - we make sure its position is correct
+                        item_object_id = collection.items[-1].item_object_id
+                        # Authenticate like every other collection call in this function.
+                        update_collection_item(
+                            collection_slug=PATH_TO_COLLECTION,
+                            item_object_id=item_object_id,
+                            position=ix,
+                            token=H4_TOKEN,
+                        )
+                    cur_best_models.append(model)
+                    break
+                except HfHubHTTPError:
+                    # This candidate could not be added; try the next best one.
+                    continue
+
+    # Drop every collection item that is no longer one of the current best models.
+    collection = get_collection(PATH_TO_COLLECTION, token=H4_TOKEN)
+    for item in collection.items:
+        if item.item_id not in cur_best_models:
+            try:
+                delete_collection_item(
+                    collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
+                )
+            except HfHubHTTPError:
+                continue
diff --git a/src/tools/model_backlinks.py b/src/tools/model_backlinks.py
new file mode 100755
index 0000000000000000000000000000000000000000..497422aed078b65099708880e788f6f442c41c4f
--- /dev/null
+++ b/src/tools/model_backlinks.py
@@ -0,0 +1,3 @@
+# Model names exposed by this module; "baseline" is the only entry.
+# NOTE(review): how this list is consumed is not visible here -- confirm against its importers.
+models = [
+    "baseline",
+]
diff --git a/src/tools/plots.py b/src/tools/plots.py
new file mode 100755
index 0000000000000000000000000000000000000000..57a55c6bf60a91c2193803adcfd1f11cc87ac555
--- /dev/null
+++ b/src/tools/plots.py
@@ -0,0 +1,168 @@
+import pandas as pd
+import numpy as np
+import plotly.express as px
+from plotly.graph_objs import Figure
+
+from src.leaderboard.filter_models import FLAGGED_MODELS
+from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
+from src.leaderboard.read_evals import EvalResult
+
+
+
+def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
+    """
+    Builds, for every metric, the sequence of record-setting scores over time.
+
+    Rows are visited in ascending ``date`` order; a row is recorded for a
+    metric only when its score is strictly higher than every earlier one.
+    When two records share the same date, the later one replaces the earlier.
+    Models listed in ``FLAGGED_MODELS`` are ignored.
+
+    :param raw_data: Evaluation results providing the ``date``, ``full_model``
+        and ``results`` fields.
+    :return: A dict mapping each benchmark column name (plus the average
+        column) to a DataFrame with ``model``, ``date`` and ``score`` columns.
+    """
+    # A score of exactly 0.0 on one of these benchmarks is left out of the
+    # average's denominator (presumably meaning "not evaluated" -- confirm).
+    optional_benchmarks = ("winogrande", "gsm8k", "eq_bench", "inst_follow", "harmlessness", "helpfulness")
+
+    # Step 1: Initialize the scores dictionary
+    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
+
+    # Step 2: Sort the results by date; dates are compared as stored, without datetime conversion
+    results_df = pd.DataFrame(raw_data)
+    if results_df.empty:
+        # Nothing to rank; sorting an empty frame by "date" would raise.
+        return {k: pd.DataFrame(v) for k, v in scores.items()}
+    results_df.sort_values(by="date", inplace=True)
+
+    # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
+    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
+        current_max = 0
+        last_date = ""
+        column = task.col_name
+        for _, row in results_df.iterrows():
+            current_model = row["full_model"]
+            if current_model in FLAGGED_MODELS:
+                continue
+
+            current_date = row["date"]
+            results = row["results"]
+            if task.benchmark == "Average":
+                avg_skip_len = sum(1 for name in optional_benchmarks if results[name] == 0.0)
+                evaluated = len(results) - avg_skip_len
+                if evaluated <= 0:
+                    # No benchmark counts towards the average, so there is no score to record.
+                    continue
+                current_score = np.sum(list(results.values())) / evaluated
+            else:
+                current_score = results[task.benchmark]
+
+            if current_score > current_max:
+                record = {"model": current_model, "date": current_date, "score": current_score}
+                if current_date == last_date and len(scores[column]) > 0:
+                    scores[column][-1] = record
+                else:
+                    scores[column].append(record)
+                current_max = current_score
+                last_date = current_date
+
+    # Step 4: Return all dictionaries as DataFrames
+    return {k: pd.DataFrame(v) for k, v in scores.items()}
+
+
+def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
+    """
+    Transforms the per-metric score DataFrames into one long DataFrame for plotting.
+
+    :param scores_df: A dict mapping each benchmark column name (plus the
+        average column) to its DataFrame of scores and dates.
+    :return: A single DataFrame with a ``task`` column naming the metric of
+        each row, sorted by ``date`` when that column is present.
+    """
+    # Tag every per-metric frame with its metric name; `assign` works on a copy,
+    # so the caller's frames are left untouched.
+    dfs = [
+        scores_df[col].reset_index(drop=True).assign(task=col)
+        for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]
+    ]
+
+    # Concatenate all the created DataFrames
+    concat_df = pd.concat(dfs, ignore_index=True)
+
+    # Sort values by 'date'; the column is absent when no metric has any score yet
+    if "date" in concat_df.columns:
+        concat_df.sort_values(by="date", inplace=True)
+        concat_df.reset_index(drop=True, inplace=True)
+    return concat_df
+
+
+def create_metric_plot_obj(
+    df: pd.DataFrame, metrics: list[str], title: str
+) -> Figure:
+    """
+    Build a Plotly line chart of the selected metrics over time, with a dotted
+    horizontal line for each metric that has a human baseline.
+
+    :param df: DataFrame with ``task``, ``date``, ``score`` and ``model`` columns.
+    :param metrics: Names of the metrics (values of ``task``) to draw.
+    :param title: Title of the figure.
+    :return: The Plotly figure, with the y-axis fixed to the range 0-100.
+    """
+    # Keep only the rows that belong to the requested metrics
+    selected_rows = df[df["task"].isin(metrics)]
+
+    # One line per metric; custom data feeds the hover text below
+    fig = px.line(
+        selected_rows,
+        x="date",
+        y="score",
+        color="task",
+        markers=True,
+        custom_data=["task", "score", "model"],
+        title=title,
+    )
+
+    hover_lines = [
+        "Model Name: %{customdata[2]}",
+        "Metric Name: %{customdata[0]}",
+        "Date: %{x}",
+        "Metric Value: %{y}",
+    ]
+    fig.update_traces(hovertemplate="<br>".join(hover_lines))
+    fig.update_layout(yaxis_range=[0, 100])
+
+    # Remember the colour Plotly assigned to each metric so baselines can match it
+    trace_colors = {trace.name: trace.line.color for trace in fig.data}
+
+    for metric, value in HUMAN_BASELINE.items():
+        if metric not in metrics:
+            continue
+        # Fall back to blue when the metric has no trace in the figure
+        color = trace_colors.get(metric, "blue")
+        if metric == "Ko-HellaSwag":
+            location = "top left"
+        else:
+            location = "bottom left"
+        fig.add_hline(
+            y=value,
+            line_dash="dot",
+            annotation_text=f"{metric} human baseline",
+            annotation_position=location,
+            annotation_font_size=10,
+            annotation_font_color=color,
+            line_color=color,
+        )
+
+    return fig
+
+
+# Example Usage:
+# Human baselines come from the module-level HUMAN_BASELINE; they are not passed as an argument.
+# chart = create_metric_plot_obj(plot_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], "Graph Title")