|
"""Utilities for interacting with the Hugging Face Hub.""" |
|
import io |
|
import json |
|
import os |
|
import shutil |
|
from pathlib import Path |
|
|
|
import pandas as pd |
|
from huggingface_hub import HfApi, hf_hub_download, list_repo_files |
|
|
|
from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS |
|
|
|
|
|
# Module-level handle to the Hub API; None signals "Hub unavailable" and every
# public helper below degrades gracefully when it sees that.
HF_API = None
try:
    HF_API = HfApi()
except Exception as e:
    # Best-effort init: the app should still come up without Hub access.
    print(f"Failed to initialize HfApi: {e}")
else:
    print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
|
|
|
|
|
def load_leaderboard_data():
    """Load leaderboard data from the Hugging Face Dataset.

    Lists every ``submission.jsonl`` under ``DS_SUBMISSIONS_PATH`` and, when a
    matching ``summary.txt`` exists under ``DS_RESULTS_PATH``, parses the
    evaluation metrics out of it. Submissions without results yet are shown
    with 'In Progress...' placeholders.

    Returns:
        pd.DataFrame: one row per submission with the ``LDB_COLS`` columns;
        an empty frame (columns only) when the API is unavailable, no
        submissions exist, or listing the repo fails.
    """
    if not HF_API:
        return pd.DataFrame(columns=LDB_COLS)

    leaderboard_entries = []
    processed_result_dirs = set()

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")

        submissions = [
            f for f in repo_files
            if f.endswith("submission.jsonl") and f.startswith(DS_SUBMISSIONS_PATH + "/")
        ]

        for file_path in submissions:
            dir_name = Path(file_path).parent.name
            if dir_name in processed_result_dirs:
                continue

            processed_result_dirs.add(dir_name)
            # Placeholder values, overwritten below when a results summary exists.
            entry = {LDB_COLS[0]: dir_name,
                     LDB_COLS[1]: 'In Progress...',
                     LDB_COLS[2]: 'In Progress...',
                     LDB_COLS[3]: 'In Progress...',
                     LDB_COLS[4]: 'In Progress...'}

            summary_repo_path = f"{DS_RESULTS_PATH}/{dir_name}/summary.txt"
            if summary_repo_path not in repo_files:
                # Evaluation has not produced a summary for this submission yet.
                leaderboard_entries.append(entry)
                continue

            download_dir = os.path.join("local_hf_downloads", dir_name)
            try:
                local_summary_path = hf_hub_download(
                    repo_id=DATASET_REPO_ID,
                    filename=summary_repo_path,
                    repo_type="dataset",
                    local_dir=download_dir,
                )

                if Path(local_summary_path).exists():
                    with open(local_summary_path, "r", encoding="utf-8") as f:
                        for line in f:
                            # Summary lines look like "<label>: <value>"; split
                            # on the first colon only so values containing ':'
                            # are not truncated.
                            if ":" not in line:
                                continue
                            value = line.split(":", 1)[1].strip()
                            if 'Execution perc' in line:
                                entry[LDB_COLS[1]] = float(value.replace("%", ""))
                            elif 'Consistency perc' in line:
                                entry[LDB_COLS[2]] = float(value.replace("%", ""))
                            elif 'Final Solution Accuracy' in line:
                                entry[LDB_COLS[3]] = float(value.replace("%", ""))
                            elif 'Total Submitted Models Parsed' in line:
                                entry[LDB_COLS[4]] = int(value)
            finally:
                # Remove the whole per-submission download directory (not just
                # the summary file) so local temp dirs don't accumulate, even
                # when download or parsing fails.
                shutil.rmtree(download_dir, ignore_errors=True)

            leaderboard_entries.append(entry)

    except Exception as e:
        # Best-effort: surface the error in logs, still return a valid frame.
        print(f"Error loading leaderboard data: {e}")

    if not leaderboard_entries:
        return pd.DataFrame(columns=LDB_COLS)

    return pd.DataFrame(leaderboard_entries)
|
|
|
|
|
def upload_submission(uploaded_file, dir_name, report_file, model_framework):
    """Upload submission to Hugging Face Dataset.

    Pushes up to three files into ``DS_SUBMISSIONS_PATH/<dir_name>`` on the
    dataset repo: the submission JSONL, an optional PDF report, and a small
    generated ``metadata.json``.

    Args:
        uploaded_file: path or file object for the submission JSONL.
        dir_name: submission directory name inside the dataset repo.
        report_file: optional path or file object for a PDF report.
        model_framework: modelling-framework label stored in the metadata.

    Returns:
        tuple: ``(True, submission_path)`` on success, otherwise
        ``(False, <error message>)``.
    """
    if not HF_API:
        return False, "Hugging Face API not initialized"

    submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"
    try:
        # 1) The submission itself.
        HF_API.upload_file(
            path_or_fileobj=uploaded_file,
            path_in_repo=f"{submission_path}/submission.jsonl",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload submission: {dir_name}"
        )

        # 2) Optional accompanying report.
        if report_file:
            HF_API.upload_file(
                path_or_fileobj=report_file,
                path_in_repo=f"{submission_path}/report.pdf",
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                commit_message=f"Upload report for submission: {dir_name}"
            )

        # 3) Generated metadata, serialized in-memory (no temp file needed).
        metadata = {
            "submission_name": dir_name,
            "modelling_framework": model_framework,
        }
        metadata_bytes = json.dumps(metadata, indent=4).encode('utf-8')
        HF_API.upload_file(
            path_or_fileobj=io.BytesIO(metadata_bytes),
            path_in_repo=f"{submission_path}/metadata.json",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload metadata for submission: {dir_name}"
        )

        return True, submission_path
    except Exception as e:
        return False, f"Upload error: {str(e)}"
|
except Exception as e: |
|
return False, f"Upload error: {str(e)}" |
|
|
|
|
|
def check_name_exists(submission_name):
    """Return True if a submission named *submission_name* already exists.

    A submission lives entirely under ``DS_SUBMISSIONS_PATH/<name>/`` (see
    ``upload_submission``), so we match on the directory prefix *including*
    the trailing slash — otherwise an existing "foobar" would make "foo"
    appear taken (false positive on name prefixes).

    Returns:
        bool: True when the name is taken; False when it is free, the API is
        unavailable, or listing the repo fails (best-effort check).
    """
    if not HF_API:
        return False

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        # Trailing "/" makes this an exact directory match, not a prefix match.
        prefix = f"{DS_SUBMISSIONS_PATH}/{submission_name}/"
        return any(file_path.startswith(prefix) for file_path in repo_files)
    except Exception as e:
        print(f"Error checking name existence: {e}")

    return False