|
"""Gradio app for the DIPROMATS 2024 Task 2 leaderboard.

Participants upload a JSON results file, have it scored against a private
gold standard, and can then publish the scores to a public leaderboard
stored as a Hugging Face dataset.
"""

import json
import os

import datasets
import gradio as gr

from dipromats_evaluation_v2 import evaluate_results

# Private gold standard used to score submissions.
DATASET_GOLD = "NLP-UNED/dipromats2024-t2_leaderboard-gold"
FILE_GOLD = 'gold_test.json'

# Public results dataset, with one split per language.
DATASET_RESULTS = "NLP-UNED/dipromats2024-t2_leaderboard-results"
SPLIT_EN = 'results_en'
SPLIT_ES = 'results_es'

# Schema of a leaderboard entry.
FEATURES_RESULTS = datasets.Features({
    "team_name": datasets.Value("string"),
    "run_id": datasets.Value("string"),
    "description": datasets.Value("string"),
    "lenient_f1": datasets.Value("float64"),
    "strict_f1": datasets.Value("float64"),
    "average_f1": datasets.Value("float64"),
})

# Empty columns used to initialise a results split that does not exist yet.
EMPTY_RESULT = {"team_name": [], "run_id": [], "description": [],
                "lenient_f1": [], "strict_f1": [], "average_f1": []}
|
|
|
|
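# A populated row would look like this (illustrative values only):
# {"team_name": "my-team", "run_id": "run1", "description": "zero-shot LLM",
#  "lenient_f1": 0.61, "strict_f1": 0.47, "average_f1": 0.54}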
|
# Hugging Face tokens: read from Colab secrets when running in Colab,
# otherwise fall back to environment variables.
try:
    from google.colab import userdata

    HF_TOKEN_GOLD = userdata.get('HF_DIPROMATS2024_T2_GOLD_TOKEN')
    HF_TOKEN_RESULTS = userdata.get('HF_DIPROMATS2024_T2_RESULTS_TOKEN')
except Exception:
    HF_TOKEN_GOLD = os.getenv('HF_DIPROMATS2024_T2_GOLD_TOKEN')
    HF_TOKEN_RESULTS = os.getenv('HF_DIPROMATS2024_T2_RESULTS_TOKEN')

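# For local runs outside Colab, the tokens can be supplied as environment
# variables, e.g. (placeholder values):
#   export HF_DIPROMATS2024_T2_GOLD_TOKEN=hf_xxx
#   export HF_DIPROMATS2024_T2_RESULTS_TOKEN=hf_xxx
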
# Load the private gold standard once at startup.
dataset_gold = datasets.load_dataset(DATASET_GOLD, split='train', data_files=FILE_GOLD, token=HF_TOKEN_GOLD)

# Load the per-language results splits, creating them on the Hub if they
# do not exist yet (e.g. on first deployment).
try:
    dataset_en = datasets.load_dataset(DATASET_RESULTS, split=SPLIT_EN)
except Exception as e:
    print(f"Error loading English dataset: {e}. Creating it...")
    dataset_en = datasets.Dataset.from_dict(EMPTY_RESULT, features=FEATURES_RESULTS, split=SPLIT_EN)
    dataset_en.push_to_hub(DATASET_RESULTS, split=SPLIT_EN, token=HF_TOKEN_RESULTS)

try:
    dataset_es = datasets.load_dataset(DATASET_RESULTS, split=SPLIT_ES)
except Exception as e:
    print(f"Error loading Spanish dataset: {e}. Creating it...")
    dataset_es = datasets.Dataset.from_dict(EMPTY_RESULT, features=FEATURES_RESULTS, split=SPLIT_ES)
    dataset_es.push_to_hub(DATASET_RESULTS, split=SPLIT_ES, token=HF_TOKEN_RESULTS)

def data_to_table(dataset):
    """Flatten a results dataset into rows for the leaderboard Dataframe."""
    table_data = []
    for item in dataset:
        table_data.append([item.get("team_name", ""), item.get("run_id", ""),
                           item.get("lenient_f1", ""), item.get("strict_f1", ""),
                           item.get("average_f1", "")])
    return table_data

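# Example (illustrative): data_to_table(dataset_en) might yield
# [["my-team", "run1", 0.61, 0.47, 0.54], ...]
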
def update_leaderboard(lang, file_path, email, team_input, run_id, description, lenient_f1, strict_f1, average_f1):
    """Validate the submission form and publish the evaluated run to the leaderboard."""
    global dataset_en
    global dataset_es

    if lang == "en":
        dataset = dataset_en
        split = SPLIT_EN
    else:
        dataset = dataset_es
        split = SPLIT_ES

    # Validate the form, warning about every missing field at once.
    warn = False
    if not email:
        gr.Warning("Email cannot be blank")
        warn = True
    if not team_input:
        gr.Warning("Team name cannot be blank")
        warn = True
    if not run_id:
        gr.Warning("Run ID cannot be blank")
        warn = True
    if not file_path:
        gr.Warning("File cannot be blank")
        warn = True
    if not description:
        gr.Warning("Description cannot be blank")
        warn = True

    # On a validation error, stay on the evaluation tab and keep the form values.
    if warn:
        return data_to_table(dataset_en), data_to_table(dataset_es), gr.Tabs(selected=1), gr.Button(visible=False), gr.Column(visible=True), team_input, run_id, description, email, file_path, lenient_f1, strict_f1, average_f1

    dataset = dataset.add_item({
        "team_name": team_input,
        "run_id": run_id,
        "description": description,
        "lenient_f1": lenient_f1,
        "strict_f1": strict_f1,
        "average_f1": average_f1
    })

    # Push only the split for the submitted language, leaving the other intact.
    dataset.push_to_hub(DATASET_RESULTS, split=split, token=HF_TOKEN_RESULTS)

    if lang == "en":
        dataset_en = dataset
    else:
        dataset_es = dataset

    # Show the leaderboard tab for the submitted language and reset the form.
    return data_to_table(dataset_en), data_to_table(dataset_es), gr.Tabs(selected=0 if lang == "en" else 2), gr.Button(visible=True), gr.Column(visible=False), "", "", "", "", None, None, None, None

def process_file(lang, file_path):
    """Evaluate an uploaded results file against the gold standard."""
    global dataset_gold

    if not file_path:
        gr.Warning("File cannot be blank")
        return gr.Button(visible=True), gr.Column(visible=False), None, None, None

    try:
        with open(file_path, 'r') as f:
            test = json.load(f)
        results = evaluate_results(lang, dataset_gold, test)
    except Exception as e:
        gr.Warning("Invalid JSON file or incorrect language")
        print(f"Error evaluating the uploaded file: {e}.")
        return gr.Button(visible=True), gr.Column(visible=False), None, None, None

    # Micro-averaged F1 under both evaluation criteria, plus their mean.
    lenient_f1 = results['lenient']['micro']['scores']['f1-score']
    strict_f1 = results['strict']['micro']['scores']['f1-score']
    average_f1 = (lenient_f1 + strict_f1) / 2

    # Hide the evaluate button and reveal the submission form.
    return gr.Button(visible=False), gr.Column(visible=True), lenient_f1, strict_f1, average_f1

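# Note: process_file assumes evaluate_results returns nested dicts shaped
# like {"lenient": {"micro": {"scores": {"f1-score": ...}}}, "strict": {...}},
# which matches the lookups above.
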
with gr.Blocks() as leaderboard:

    gr.Markdown(
        """
        # DIPROMATS 2024 Task 2 Leaderboard
        ## Automatic Detection of Narratives from Diplomats of Major Powers
        These are the leaderboards for DIPROMATS 2024 Task 2, described at <a href="https://nlp.uned.es/dipromats2024">nlp.uned.es/dipromats2024</a>.
        The gold standard is not publicly available, so LLMs cannot be contaminated with it.
        However, you can submit your results here and have your system evaluated automatically.
        You can then choose whether to publish your results on the leaderboard.
        """)

    with gr.Tabs() as tabs:

with gr.TabItem("English Leaderboard", id=0): |
|
gr.Markdown( |
|
""" |
|
# English Leaderboard |
|
""") |
|
leaderboard_table_en = gr.Dataframe(headers=["Team", "Run ID", "Lenient F1", "Strict F1", "Average F1"], |
|
value=data_to_table(dataset_en), |
|
interactive=False) |
|
|
|
|
|
with gr.TabItem("Spanish Leaderboard", id=2): |
|
gr.Markdown( |
|
""" |
|
# Spanish Leaderboard |
|
""") |
|
leaderboard_table_es = gr.Dataframe(headers=["Team", "Run ID", "Lenient F1", "Strict F1", "Average F1"], |
|
value=data_to_table(dataset_es), |
|
interactive=False) |
|
|
|
|
|
with gr.TabItem("Evaluate your results", id=1): |
|
gr.Markdown( |
|
""" |
|
# Upload your results and get evaluated |
|
Then you can decide to submit your results to the leaderboard or not. |
|
Make sure that you upload a file with the json format described in... |
|
""") |
|
with gr.Row(): |
|
file_input = gr.File(label="Upload a JSON file", file_types=[".json"], type="filepath", file_count="single") |
|
with gr.Column(): |
|
lang = gr.Dropdown(label="Language", choices=["en", "es"], interactive=True) |
|
evaluate_button = gr.Button("Evaluate") |
|
|
|
|
|
            # Scores are filled in after evaluation.
            with gr.Row(visible=True):
                lenient_f1 = gr.Number(label="Lenient F1", interactive=False)
                strict_f1 = gr.Number(label="Strict F1", interactive=False)
                average_f1 = gr.Number(label="Average F1", interactive=False)

            # Submission form, hidden until a file has been evaluated.
            with gr.Column(visible=False) as submission_col:
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            team_input = gr.Textbox(label="Team Name")
                            run_id = gr.Textbox(label="Run ID")
                        email_input = gr.Textbox(label="Email (only for submission verification, it won't be shown)")
                        description_input = gr.Textbox(label="System description", lines=6)
                submit_button = gr.Button("Submit to leaderboard")

    # Wire up the two actions: evaluate an uploaded file, then optionally
    # submit the evaluated run to the leaderboard.
    evaluate_button.click(process_file,
                          inputs=[lang, file_input],
                          outputs=[evaluate_button, submission_col, lenient_f1, strict_f1, average_f1])

    submit_button.click(update_leaderboard,
                        inputs=[lang, file_input, email_input, team_input, run_id, description_input, lenient_f1, strict_f1, average_f1],
                        outputs=[leaderboard_table_en, leaderboard_table_es, tabs, evaluate_button, submission_col, team_input, run_id, description_input, email_input, file_input, lenient_f1, strict_f1, average_f1])

leaderboard.launch()