"""Utilities for interacting with the Hugging Face Hub.""" import io import json import os import shutil from pathlib import Path import pandas as pd from huggingface_hub import HfApi, hf_hub_download, list_repo_files from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS # Initialize HfApi try: HF_API = HfApi() print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}") except Exception as e: print(f"Failed to initialize HfApi: {e}") HF_API = None def load_leaderboard_data(): """Load leaderboard data from Hugging Face Dataset.""" if not HF_API: return pd.DataFrame(columns=LDB_COLS) leaderboard_entries = [] processed_result_dirs = set() try: # List all files in the results path of the dataset repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset") # Find all summary files summary_files = [ f for f in repo_files if f.endswith("summary.txt") and f.startswith(DS_RESULTS_PATH + "/") ] summary_files.sort(reverse=True) submissions = [ f for f in repo_files if f.endswith("submission.jsonl") and f.startswith(DS_SUBMISSIONS_PATH + "/") ] metadata_files = [ f for f in repo_files if f.endswith("metadata.json") and f.startswith(DS_SUBMISSIONS_PATH + "/") ] # for file_path in summary_files: for file_path in submissions: dir_name = Path(file_path).parent.name if dir_name in processed_result_dirs: continue # download metadata file of this submission metadata_file = next((f for f in metadata_files if f.startswith(f"{DS_SUBMISSIONS_PATH}/{dir_name}/")), None) if metadata_file: local_metadata_path = hf_hub_download( repo_id=DATASET_REPO_ID, filename=metadata_file, repo_type="dataset", local_dir=os.path.join("local_hf_downloads", dir_name), ) with open(local_metadata_path, "r", encoding="utf-8") as f: metadata = json.load(f) os.remove(local_metadata_path) processed_result_dirs.add(dir_name) entry = {LDB_COLS[0]: dir_name, LDB_COLS[1]: metadata.get("modelling_framework", "Unknown"), LDB_COLS[2]: metadata.get("base_llm", "Unknown"), LDB_COLS[3]: '*Calculating...*', LDB_COLS[4]: '*Calculating...*', LDB_COLS[5]: '*Calculating...*'} # check if summary file exists, otherwise skip if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files: leaderboard_entries.append(entry) continue # Download summary file local_summary_path = hf_hub_download( repo_id=DATASET_REPO_ID, filename=f"{DS_RESULTS_PATH}/{dir_name}/summary.txt", repo_type="dataset", local_dir=os.path.join("local_hf_downloads", dir_name), ) if Path(local_summary_path).exists(): with open(local_summary_path, "r", encoding="utf-8") as f: for line in f: if 'Error perc' in line: entry[LDB_COLS[5]] = float(line.split(":")[1].strip().replace("%", "")) if 'Final Solution Accuracy' in line: entry[LDB_COLS[4]] = float(line.split(":")[1].strip().replace("%", "")) if 'Submission coverage perc' in line: entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", "")) os.remove(local_summary_path) else: print(f"Warning: Summary file {local_summary_path} does not exist or is empty.") leaderboard_entries.append(entry) except Exception as e: print(f"Error loading leaderboard data: {e}") if not leaderboard_entries: return pd.DataFrame(columns=LDB_COLS) df = pd.DataFrame(leaderboard_entries) # Sort by "Final Solution Accuracy" descending df[LDB_COLS[4]] = pd.to_numeric(df[LDB_COLS[4]], errors='coerce') # Ensure numeric type df = df.sort_values(by=LDB_COLS[4], ascending=False) return df def upload_submission(uploaded_file, dir_name, report_file, model_framework, base_llm): """Upload 


def upload_submission(uploaded_file, dir_name, report_file, model_framework, base_llm):
    """Upload a submission, its optional PDF report, and metadata to the dataset."""
    if not HF_API:
        return False, "Hugging Face API not initialized"

    try:
        submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"

        HF_API.upload_file(
            path_or_fileobj=uploaded_file,
            path_in_repo=f"{submission_path}/submission.jsonl",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload submission: {dir_name}",
        )

        if report_file:
            HF_API.upload_file(
                path_or_fileobj=report_file,
                path_in_repo=f"{submission_path}/report.pdf",
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                commit_message=f"Upload report for submission: {dir_name}",
            )

        # Store the submission metadata as a small JSON file next to the upload.
        metadata = {
            "submission_name": dir_name,
            "modelling_framework": model_framework,
            "base_llm": base_llm,
        }
        HF_API.upload_file(
            path_or_fileobj=io.BytesIO(json.dumps(metadata, indent=4).encode("utf-8")),
            path_in_repo=f"{submission_path}/metadata.json",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload metadata for submission: {dir_name}",
        )

        return True, submission_path
    except Exception as e:
        return False, f"Upload error: {e}"


def check_name_exists(submission_name):
    """Return True if a submission with this exact name already exists."""
    if not HF_API:
        return False

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        # The trailing slash keeps "foo" from matching "foobar/...".
        return any(
            file_path.startswith(f"{DS_SUBMISSIONS_PATH}/{submission_name}/")
            for file_path in repo_files
        )
    except Exception as e:
        print(f"Error checking name existence: {e}")
    return False
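

# ---------------------------------------------------------------------------
# Usage sketch (not part of the app's import surface): a minimal smoke test
# showing how the app layer might call these helpers. The sample values below
# ("demo-submission", "Pyomo", "gpt-4o", the JSONL payload) are illustrative
# assumptions, not names used elsewhere in this repo. Note that running this
# block performs a real upload to DATASET_REPO_ID if write credentials are
# configured.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Render the current leaderboard as a DataFrame.
    print(load_leaderboard_data())

    name = "demo-submission"  # hypothetical submission name
    if check_name_exists(name):
        print(f"Submission name already taken: {name}")
    else:
        # A one-line in-memory JSONL payload stands in for a real submission file.
        ok, detail = upload_submission(
            uploaded_file=io.BytesIO(b'{"problem": "p1", "solution": 42.0}\n'),
            dir_name=name,
            report_file=None,
            model_framework="Pyomo",
            base_llm="gpt-4o",
        )
        if ok:
            print(f"Uploaded to: {detail}")
        else:
            print(detail)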