"""Utilities for interacting with the Hugging Face Hub."""

import os
import shutil
from pathlib import Path

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
from huggingface_hub.utils import RepositoryNotFoundError, HFValidationError

from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS

# Scratch directory for downloaded summary files; removed after every load.
_TEMP_DOWNLOAD_DIR = "temp_hf_downloads"

# Initialize HfApi
try:
    HF_API = HfApi()
    print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
except Exception as e:
    print(f"Failed to initialize HfApi: {e}")
    HF_API = None


def _parse_percent(text):
    """Convert a value such as ``"87.5%"`` to a float, ignoring the ``%`` sign."""
    return float(text.replace("%", ""))


# (marker text searched for in a summary line, index into LDB_COLS, converter)
_SUMMARY_FIELDS = (
    ("Execution perc", 1, _parse_percent),
    ("Consistency perc", 2, _parse_percent),
    ("Final Solution Accuracy", 3, _parse_percent),
    ("Total Submitted Models Parsed", 4, int),
)


def _parse_summary_line(line, entry):
    """Update *entry* in place with any metric found on one summary line.

    The value is the text between the first and second ``:`` on the line.
    Lines without a ``:`` or with an unparsable value are reported and
    skipped, leaving the entry's existing value for that column untouched.
    """
    for marker, col_index, convert in _SUMMARY_FIELDS:
        if marker not in line:
            continue
        segments = line.split(":")
        if len(segments) < 2:
            print(f"Skipping malformed summary line: {line.strip()!r}")
            continue
        try:
            entry[LDB_COLS[col_index]] = convert(segments[1].strip())
        except ValueError:
            print(f"Skipping malformed summary line: {line.strip()!r}")


def _fill_entry_from_summary(entry, file_path, dir_name):
    """Download the summary at *file_path* and copy its metrics into *entry*."""
    temp_dir = os.path.join(_TEMP_DOWNLOAD_DIR, dir_name)
    local_summary_path = hf_hub_download(
        repo_id=DATASET_REPO_ID,
        filename=file_path,
        repo_type="dataset",
        local_dir=temp_dir,
    )
    if not Path(local_summary_path).exists():
        return
    try:
        with open(local_summary_path, "r", encoding="utf-8") as f:
            for line in f:
                _parse_summary_line(line, entry)
    finally:
        # Delete eagerly so the scratch directory stays small during the loop.
        os.remove(local_summary_path)


def load_leaderboard_data():
    """Load leaderboard data from Hugging Face Dataset.

    Lists the dataset repository, picks every ``summary.txt`` under
    ``DS_RESULTS_PATH`` (one per result directory name) and parses its
    metrics into a row keyed by the first five ``LDB_COLS`` names.

    Returns:
        A ``pandas.DataFrame`` with one row per result directory. An empty
        frame with ``LDB_COLS`` columns is returned when the API is not
        initialized, the listing fails, or no summary could be read.

    A summary that cannot be downloaded is skipped and a malformed line is
    ignored, so one bad result directory does not hide the others.
    """
    if not HF_API:
        return pd.DataFrame(columns=LDB_COLS)

    leaderboard_entries = []
    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")

        results_prefix = DS_RESULTS_PATH + "/"
        summary_files = sorted(
            (
                f
                for f in repo_files
                if f.endswith("summary.txt") and f.startswith(results_prefix)
            ),
            reverse=True,
        )

        seen_dirs = set()
        for file_path in summary_files:
            dir_name = Path(file_path).parent.name
            # Only the first summary (in reverse-sorted order) per directory
            # name is used.
            if dir_name in seen_dirs:
                continue
            seen_dirs.add(dir_name)

            entry = {
                LDB_COLS[0]: dir_name,
                LDB_COLS[1]: 'N/A',
                LDB_COLS[2]: 'N/A',
                LDB_COLS[3]: 'N/A',
                LDB_COLS[4]: 0,
            }
            try:
                _fill_entry_from_summary(entry, file_path, dir_name)
            except Exception as e:
                # Isolate the failure to this entry instead of aborting the
                # whole leaderboard load.
                print(f"Error loading summary for {dir_name}: {e}")
                continue
            leaderboard_entries.append(entry)
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")
    finally:
        # Clean up; ignore_errors also covers the directory not existing.
        shutil.rmtree(_TEMP_DOWNLOAD_DIR, ignore_errors=True)

    if not leaderboard_entries:
        return pd.DataFrame(columns=LDB_COLS)
    return pd.DataFrame(leaderboard_entries)


def upload_submission(uploaded_file, dir_name):
    """Upload submission to Hugging Face Dataset.

    The file is stored as ``submission.jsonl`` under
    ``DS_SUBMISSIONS_PATH/dir_name`` in the dataset repository.

    Args:
        uploaded_file: Value passed through to ``HfApi.upload_file`` as
            ``path_or_fileobj``.
        dir_name: Name of the submission directory to create in the repo.

    Returns:
        ``(True, submission_path)`` on success, otherwise
        ``(False, error_message)``.
    """
    if not HF_API:
        return False, "Hugging Face API not initialized"

    try:
        submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"
        HF_API.upload_file(
            path_or_fileobj=uploaded_file,
            path_in_repo=f"{submission_path}/submission.jsonl",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload submission: {dir_name}",
        )
        return True, submission_path
    except Exception as e:
        return False, f"Upload error: {str(e)}"


def check_name_exists(submission_name):
    """Return True if a submission named *submission_name* is already stored.

    A name counts as taken when the repository holds a path equal to
    ``DS_SUBMISSIONS_PATH/submission_name`` or any file inside that
    directory. Matching on the directory boundary keeps ``"foo"`` from being
    reported as taken merely because ``"foobar"`` exists.

    Returns False when the API is not initialized or the listing fails.
    """
    if not HF_API:
        return False

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        submission_root = f"{DS_SUBMISSIONS_PATH}/{submission_name}"
        submission_dir = submission_root + "/"
        return any(
            file_path == submission_root or file_path.startswith(submission_dir)
            for file_path in repo_files
        )
    except Exception as e:
        print(f"Error checking name existence: {e}")
    return False