"""Utilities for interacting with the Hugging Face Hub."""
import os
import shutil
from pathlib import Path
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
from huggingface_hub.utils import RepositoryNotFoundError, HFValidationError
from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS

# Initialize the Hub API client once at import time; fall back to None so
# callers can degrade gracefully if initialization fails.
try:
    HF_API = HfApi()
    print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
except Exception as e:
    print(f"Failed to initialize HfApi: {e}")
    HF_API = None


def load_leaderboard_data():
    """Load leaderboard data from the Hugging Face dataset repo."""
    if not HF_API:
        return pd.DataFrame(columns=LDB_COLS)

    leaderboard_entries = []
    processed_result_dirs = set()

    try:
        # List all files in the dataset and keep only the per-run summary files.
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        summary_files = [
            f for f in repo_files
            if f.endswith("summary.txt") and f.startswith(DS_RESULTS_PATH + "/")
        ]
        # Sort in reverse lexicographic order; only the first summary seen for
        # each result directory is kept.
        summary_files.sort(reverse=True)

        for file_path in summary_files:
            dir_name = Path(file_path).parent.name
            if dir_name in processed_result_dirs:
                continue
            processed_result_dirs.add(dir_name)

            entry = {
                LDB_COLS[0]: dir_name,
                LDB_COLS[1]: "N/A",
                LDB_COLS[2]: "N/A",
                LDB_COLS[3]: "N/A",
                LDB_COLS[4]: 0,
            }

            # Download the summary file into a per-run temp directory.
            temp_dir = os.path.join("temp_hf_downloads", dir_name)
            local_summary_path = hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=file_path,
                repo_type="dataset",
                local_dir=temp_dir,
            )
            # Parse the scores out of the summary file, then remove the local copy.
            if Path(local_summary_path).exists():
                with open(local_summary_path, "r", encoding="utf-8") as f:
                    for line in f:
                        if "Execution perc" in line:
                            entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
                        if "Consistency perc" in line:
                            entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
                        if "Final Solution Accuracy" in line:
                            entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
                        if "Total Submitted Models Parsed" in line:
                            entry[LDB_COLS[4]] = int(line.split(":")[1].strip())
                os.remove(local_summary_path)

            leaderboard_entries.append(entry)
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")
    finally:
        # Clean up any downloaded files.
        if Path("temp_hf_downloads").exists():
            shutil.rmtree("temp_hf_downloads", ignore_errors=True)

    if not leaderboard_entries:
        return pd.DataFrame(columns=LDB_COLS)
    return pd.DataFrame(leaderboard_entries)
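
# Usage sketch (hypothetical, for illustration only): the returned DataFrame
# can be fed straight to a UI table, e.g. in a Gradio front end:
#   leaderboard_df = load_leaderboard_data()
#   gr.Dataframe(value=leaderboard_df)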


def upload_submission(uploaded_file, dir_name):
    """Upload a submission file to the Hugging Face dataset repo."""
    if not HF_API:
        return False, "Hugging Face API not initialized"

    try:
        submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"
        HF_API.upload_file(
            path_or_fileobj=uploaded_file,
            path_in_repo=f"{submission_path}/submission.jsonl",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload submission: {dir_name}",
        )
        return True, submission_path
    except Exception as e:
        return False, f"Upload error: {e}"
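
# Example call (hypothetical names): `uploaded_file` may be a local path or a
# file-like object, as accepted by HfApi.upload_file:
#   ok, detail = upload_submission("submission.jsonl", "my-run-01")
#   # ok is True and detail is the repo path on success, else an error message.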


def check_name_exists(submission_name):
    """Return True if a submission with this name already exists in the dataset."""
    if not HF_API:
        return False

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        for file_path in repo_files:
            if file_path.startswith(f"{DS_SUBMISSIONS_PATH}/{submission_name}"):
                return True
    except Exception as e:
        print(f"Error checking name existence: {e}")
    return False
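

if __name__ == "__main__":
    # Minimal manual smoke test (a sketch for local debugging, not part of the
    # app): assumes DATASET_REPO_ID in src.config points at a reachable dataset
    # repo; "example-submission" is a hypothetical name.
    df = load_leaderboard_data()
    print(df)
    print("Name already taken:", check_name_exists("example-submission"))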