"""Utilities for interacting with the Hugging Face Hub."""

import io
import json
import os
import shutil
from pathlib import Path

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS
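
# DATASET_REPO_ID names the dataset repo; DS_SUBMISSIONS_PATH and
# DS_RESULTS_PATH are its top-level folders for uploads and evaluation
# results; LDB_COLS lists the leaderboard columns. Judging by how they are
# indexed below, the assumed order is: submission name, modelling framework,
# base LLM, coverage, accuracy, error percentage.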

try:
    HF_API = HfApi()
    print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
except Exception as e:
    print(f"Failed to initialize HfApi: {e}")
    HF_API = None
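
# HF_API doubles as a feature flag: when initialization fails, every function
# below degrades gracefully (empty DataFrame, False, or an error tuple)
# instead of raising.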


def load_leaderboard_data():
    """Load leaderboard data from the Hugging Face dataset repo."""
    if not HF_API:
        return pd.DataFrame(columns=LDB_COLS)

    leaderboard_entries = []
    processed_result_dirs = set()

    try:
        # A single repo listing is enough to discover every submission and to
        # check which ones already have results.
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")

        submissions = [
            f for f in repo_files
            if f.endswith("submission.jsonl") and f.startswith(DS_SUBMISSIONS_PATH + "/")
        ]
        metadata_files = [
            f for f in repo_files
            if f.endswith("metadata.json") and f.startswith(DS_SUBMISSIONS_PATH + "/")
        ]

        for file_path in submissions:
            dir_name = Path(file_path).parent.name
            if dir_name in processed_result_dirs:
                continue
            processed_result_dirs.add(dir_name)

            # Load the submission's metadata, if present. Default to an empty
            # dict so the entry below still builds when metadata.json is missing.
            metadata = {}
            metadata_file = next(
                (f for f in metadata_files if f.startswith(f"{DS_SUBMISSIONS_PATH}/{dir_name}/")),
                None,
            )
            if metadata_file:
                local_metadata_path = hf_hub_download(
                    repo_id=DATASET_REPO_ID,
                    filename=metadata_file,
                    repo_type="dataset",
                    local_dir=os.path.join("local_hf_downloads", dir_name),
                )
                with open(local_metadata_path, "r", encoding="utf-8") as f:
                    metadata = json.load(f)
                os.remove(local_metadata_path)

            entry = {
                LDB_COLS[0]: dir_name,
                LDB_COLS[1]: metadata.get("modelling_framework", "Unknown"),
                LDB_COLS[2]: metadata.get("base_llm", "Unknown"),
                LDB_COLS[3]: "*Calculating...*",
                LDB_COLS[4]: "*Calculating...*",
                LDB_COLS[5]: "*Calculating...*",
            }

            # No results published yet: keep the placeholder values.
            if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files:
                leaderboard_entries.append(entry)
                continue

            local_summary_path = hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=f"{DS_RESULTS_PATH}/{dir_name}/summary.txt",
                repo_type="dataset",
                local_dir=os.path.join("local_hf_downloads", dir_name),
            )

            if Path(local_summary_path).exists():
                with open(local_summary_path, "r", encoding="utf-8") as f:
                    for line in f:
                        if "Error perc" in line:
                            entry[LDB_COLS[5]] = float(line.split(":")[1].strip().replace("%", ""))
                        if "Final Solution Accuracy" in line:
                            entry[LDB_COLS[4]] = float(line.split(":")[1].strip().replace("%", ""))
                        if "Submission coverage perc" in line:
                            entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
                os.remove(local_summary_path)
            else:
                print(f"Warning: summary file {local_summary_path} does not exist.")

            leaderboard_entries.append(entry)

    except Exception as e:
        print(f"Error loading leaderboard data: {e}")

    # hf_hub_download mirrors the repo layout under local_dir; the os.remove
    # calls above delete the files themselves, so clear out the leftover
    # directory tree as well.
    shutil.rmtree("local_hf_downloads", ignore_errors=True)

    if not leaderboard_entries:
        return pd.DataFrame(columns=LDB_COLS)

    df = pd.DataFrame(leaderboard_entries)

    # Coerce the accuracy column to numeric so finished runs sort correctly;
    # entries still showing "*Calculating...*" become NaN and land at the
    # bottom of the leaderboard.
    df[LDB_COLS[4]] = pd.to_numeric(df[LDB_COLS[4]], errors="coerce")
    df = df.sort_values(by=LDB_COLS[4], ascending=False)

    return df


def upload_submission(uploaded_file, dir_name, report_file, model_framework, base_llm):
    """Upload a submission file, optional PDF report, and metadata to the dataset repo."""
    if not HF_API:
        return False, "Hugging Face API not initialized"

    try:
        submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"
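        # upload_file accepts a local path, raw bytes, or a binary file object
        # for path_or_fileobj, so uploaded_file and report_file may be any of
        # these.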
        HF_API.upload_file(
            path_or_fileobj=uploaded_file,
            path_in_repo=f"{submission_path}/submission.jsonl",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload submission: {dir_name}",
        )
        if report_file:
            HF_API.upload_file(
                path_or_fileobj=report_file,
                path_in_repo=f"{submission_path}/report.pdf",
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                commit_message=f"Upload report for submission: {dir_name}",
            )

        # metadata.json is what load_leaderboard_data reads back to fill the
        # framework and base-LLM columns.
        metadata = {
            "submission_name": dir_name,
            "modelling_framework": model_framework,
            "base_llm": base_llm,
        }
        HF_API.upload_file(
            path_or_fileobj=io.BytesIO(json.dumps(metadata, indent=4).encode("utf-8")),
            path_in_repo=f"{submission_path}/metadata.json",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload metadata for submission: {dir_name}",
        )

        return True, submission_path
    except Exception as e:
        return False, f"Upload error: {e}"


def check_name_exists(submission_name):
    """Return True if a submission named submission_name already exists in the repo."""
    if not HF_API:
        return False

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        for file_path in repo_files:
            # Require the trailing slash so an existing "run10" does not make
            # the name "run1" look taken.
            if file_path.startswith(f"{DS_SUBMISSIONS_PATH}/{submission_name}/"):
                return True
    except Exception as e:
        print(f"Error checking name existence: {e}")

    return False
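

if __name__ == "__main__":
    # Minimal smoke test, assuming valid Hub credentials and that src.config
    # points at a real dataset repo; the submission name here is hypothetical.
    print(check_name_exists("example-submission"))
    print(load_leaderboard_data().head())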