import json
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.display.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
TABLE_DESC,
)
from src.display.css_html_js import custom_css
from src.display.formatting import styled_error, styled_message, styled_warning
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
NUMERIC_INTERVALS,
TYPES,
TYPES_LITE,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
from captcha.image import ImageCaptcha
from PIL import Image
import random, string
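# Module-level dataframes holding the leaderboard data; populated by download_data()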
original_df = None
leaderboard_df = None
def restart_space():
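    """Restart the Space; used as a fallback when a snapshot download fails and for the daily refresh job."""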
API.restart_space(repo_id=REPO_ID, token=TOKEN)
def download_data():
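    """Download the request-queue and results snapshots from the Hub and rebuild the leaderboard dataframes."""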
global original_df
global leaderboard_df
try:
        print(EVAL_REQUESTS_PATH, QUEUE_REPO)
snapshot_download(
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
)
except Exception:
restart_space()
try:
print(EVAL_RESULTS_PATH, RESULTS_REPO)
snapshot_download(
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
)
except Exception:
restart_space()
_, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
leaderboard_df = original_df.copy()
download_data()
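# The evaluation queue tables are currently disabled; the block below is kept for reference.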
"""
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
"""
# Searching and filtering
def update_table(
hidden_df: pd.DataFrame,
columns: list,
query: str,
):
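    """Rebuild the visible table from the full hidden dataframe using the current search query and column selection."""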
    columns = columns + ["_"]  # Workaround: pad the selection so the last real column is not hidden (possible Gradio display bug)
#filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
filtered_df = filter_queries(query, hidden_df)
df = select_columns(filtered_df, columns)
print("TF")
print(df)
return df
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Case-insensitive substring match against the model name column
    return df[df[AutoEvalColumn.eval_name.name].str.contains(query, case=False)]
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
always_here_cols = [
#AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.eval_name.name,
]
# We use COLS to maintain sorting
filtered_df = df[
always_here_cols + [c for c in COLS if c in df.columns and c in columns] #+ [AutoEvalColumn.dummy.name]
]
return filtered_df
def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
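    """Apply each semicolon-separated search query and merge the matching rows, dropping duplicate entries."""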
final_df = []
    if query is not None and query != "":
        queries = [q.strip() for q in query.split(";")]
        for _q in queries:
            if _q != "":
                temp_filtered_df = search_table(filtered_df, _q)
                if len(temp_filtered_df) > 0:
                    final_df.append(temp_filtered_df)
if len(final_df) > 0:
filtered_df = pd.concat(final_df)
filtered_df = filtered_df.drop_duplicates(
subset=[AutoEvalColumn.eval_name.name]
)
return filtered_df
def filter_models(
df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
) -> pd.DataFrame:
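    """Currently a no-op: model type/size/precision filtering is disabled and the dataframe is returned unchanged."""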
# Show all models
#if show_deleted:
# filtered_df = df
#else: # Show only still on the hub models
# filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
filtered_df = df
#type_emoji = [t[0] for t in type_query]
#filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
#filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
#numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
#params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
#mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
#filtered_df = filtered_df.loc[mask]
return filtered_df
def validate_upload(filepath):
    # Reject uploads that are not valid JSON before they reach the submission step
    try:
        with open(filepath, mode="r") as f:
            json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        raise gr.Error("Cannot parse file") from e
def generate_captcha(width=300, height=220, length=4):
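    """Generate a random alphanumeric CAPTCHA image and return it together with the expected text."""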
text = ''.join(random.choices(string.ascii_uppercase + string.digits, k=length))
captcha_obj = ImageCaptcha(width, height)
data = captcha_obj.generate(text)
image = Image.open(data)
return image, text
def validate_captcha(user_input, text, img):
    # Issue a fresh CAPTCHA on every attempt, whether or not the answer was correct
    new_img, new_text = generate_captcha()
    if user_input.lower() == text.lower():
        return True, styled_message("Correct! You can proceed with your submission."), new_text, new_img, ""
    return False, styled_error("Incorrect! Please retry with the new code."), new_text, new_img, ""
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ… Leaderboard", elem_id="llm-benchmark-tab-table", id=0) as tb_board:
with gr.Row():
with gr.Column():
with gr.Row():
search_bar = gr.Textbox(
placeholder=" πŸ” Search for your model (separate multiple queries with `;`) and press ENTER...",
show_label=False,
elem_id="search-bar",
)
with gr.Row():
shown_columns = gr.CheckboxGroup(
choices=[
c.name
for c in fields(AutoEvalColumn)
if not c.hidden and not c.never_hidden and not c.dummy
],
value=[
c.name
for c in fields(AutoEvalColumn)
if c.displayed_by_default and not c.hidden and not c.never_hidden
],
label="Select columns to show",
elem_id="column-select",
interactive=True,
)
"""
with gr.Column(min_width=320):
# with gr.Box(elem_id="box-filter"):
filter_columns_type = gr.CheckboxGroup(
label="Model types",
choices=[t.to_str() for t in ModelType],
value=[t.to_str() for t in ModelType],
interactive=True,
elem_id="filter-columns-type",
)
filter_columns_precision = gr.CheckboxGroup(
label="Precision",
choices=[i.value.name for i in Precision],
value=[i.value.name for i in Precision],
interactive=True,
elem_id="filter-columns-precision",
)
filter_columns_size = gr.CheckboxGroup(
label="Model sizes (in billions of parameters)",
choices=list(NUMERIC_INTERVALS.keys()),
value=list(NUMERIC_INTERVALS.keys()),
interactive=True,
elem_id="filter-columns-size",
)
"""
gr.Markdown(TABLE_DESC, elem_classes="markdown-text")
#print(shown_columns.value)
leaderboard_table = gr.Dataframe(
value=leaderboard_df[
[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value
],
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
datatype=TYPES_LITE,
elem_id="leaderboard-table",
interactive=False,
visible=True,
wrap=False
)
# Dummy leaderboard for handling the case when the user uses backspace key
hidden_leaderboard_table_for_search = gr.Dataframe(
value=original_df[COLS],
headers=COLS,
datatype=TYPES,
visible=False,
)
search_bar.submit(
update_table,
[
hidden_leaderboard_table_for_search,
shown_columns,
search_bar,
],
leaderboard_table,
)
for selector in [
shown_columns,
]:
selector.change(
update_table,
[
hidden_leaderboard_table_for_search,
shown_columns,
search_bar,
],
leaderboard_table,
queue=True,
)
with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
"""
with gr.Column():
with gr.Accordion(
f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
"""
with gr.Row():
gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
with gr.Group():
model_name_textbox = gr.Textbox(label="Model name", info="Please avoid using the slash (/) character")
#precision = gr.Radio(["bfloat16", "float16", "4bit"], label="Precision", info="What precision are you using for inference?")
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="other",
interactive=True,
info="What weight precision were you using during the evaluation?"
)
                        hf_model_id = gr.Textbox(label="Model link (Optional)", info="URL to the model's Hugging Face repository, or its official website")
contact_email = gr.Textbox(label="Your E-Mail")
file_input = gr.File(file_count="single", interactive=True, label="Upload json file with evaluation results", file_types=['.json', '.jsonl'])
file_input.upload(validate_upload, file_input)
#upload_button = gr.UploadButton("Upload json", file_types=['.json'])
#upload_button.upload(validate_upload, upload_button, file_input)
with gr.Group():
captcha_correct = gr.State(False)
text = gr.State("")
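                        # Pre-generate an initial CAPTCHA; demo.load() below issues a fresh one on every page load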
image, text.value = generate_captcha()
captcha_img = gr.Image(
image,
label="Prove your humanity",
interactive=False,
show_download_button=False,
show_fullscreen_button=False,
show_share_button=False,
)
captcha_input = gr.Textbox(placeholder="Enter the text in the image above", show_label=False, container=False)
check_button = gr.Button("Validate", interactive=True)
captcha_result = gr.Markdown()
check_button.click(
fn = validate_captcha,
inputs = [captcha_input, text, captcha_img],
outputs = [captcha_correct, captcha_result, text, captcha_img, captcha_input],
)
submit_button = gr.Button("Submit Eval", interactive=True)
submission_result = gr.Markdown()
submit_button.click(
fn = add_new_eval,
inputs = [
model_name_textbox,
file_input,
precision,
hf_model_id,
contact_email,
captcha_correct,
],
outputs = [submission_result, captcha_correct],
)
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
demo.load(
fn=generate_captcha,
outputs=[captcha_img, text]
)
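# Restart the Space once a day so the downloaded leaderboard data stays fresh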
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=86400)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0")