File size: 6,529 Bytes
180f9fe e67d561 180f9fe e67d561 180f9fe 27b03dc 444cb2e 27b03dc 180f9fe 27b03dc 444cb2e 180f9fe 27b03dc 444cb2e 27b03dc 180f9fe 27b03dc 180f9fe f021e7d 180f9fe 27b03dc 180f9fe 2e2392c a6f5bd8 2e2392c a6f5bd8 2e2392c a6f5bd8 180f9fe 60a95c1 180f9fe f021e7d 180f9fe 43dd2bb 180f9fe 444cb2e 180f9fe 21ed616 180f9fe 21ed616 27b03dc e67d561 444cb2e e67d561 180f9fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
"""Utilities for interacting with the Hugging Face Hub."""
import io
import json
import os
import shutil
from pathlib import Path
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS
def _init_hf_api():
    """Build the shared Hub client, or return None if construction fails.

    The outcome is reported on stdout either way so that start-up logs show
    whether the Hub is reachable from this process.
    """
    try:
        client = HfApi()
        print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
        return client
    except Exception as init_error:
        print(f"Failed to initialize HfApi: {init_error}")
        return None


# Module-wide Hub client; None means initialization failed.
HF_API = _init_hf_api()
def _read_downloaded_text(repo_path, dir_name):
    """Download ``repo_path`` from the dataset repo and return its text, or None if no local file appears.

    The local copy is always deleted afterwards, even when reading fails.
    """
    local_path = hf_hub_download(
        repo_id=DATASET_REPO_ID,
        filename=repo_path,
        repo_type="dataset",
        local_dir=os.path.join("local_hf_downloads", dir_name),
    )
    if not Path(local_path).exists():
        return None
    try:
        with open(local_path, "r", encoding="utf-8") as f:
            return f.read()
    finally:
        os.remove(local_path)


def _parse_summary_metrics(summary_text):
    """Extract the percentage metrics found in a summary file's text, keyed by leaderboard column."""
    markers = (
        ("Error perc", LDB_COLS[5]),
        ("Final Solution Accuracy", LDB_COLS[4]),
        ("Submission coverage perc", LDB_COLS[3]),
    )
    metrics = {}
    for line in summary_text.split("\n"):
        for marker, column in markers:
            if marker not in line:
                continue
            try:
                # Expected shape is "<label>: <number>%"; take the text after the first colon.
                metrics[column] = float(line.split(":")[1].strip().replace("%", ""))
            except (IndexError, ValueError):
                # A malformed line must not discard the other metrics.
                print(f"Warning: could not parse metric from summary line: {line!r}")
    return metrics


def _build_leaderboard_entry(dir_name, metadata_files, repo_file_set):
    """Assemble one leaderboard row for the submission stored under ``dir_name``."""
    # Default to an empty mapping so a submission without metadata.json shows
    # "Unknown" instead of raising NameError or reusing the previous
    # submission's metadata.
    metadata = {}
    metadata_prefix = f"{DS_SUBMISSIONS_PATH}/{dir_name}/"
    metadata_file = next((f for f in metadata_files if f.startswith(metadata_prefix)), None)
    if metadata_file:
        metadata_text = _read_downloaded_text(metadata_file, dir_name)
        if metadata_text is not None:
            loaded = json.loads(metadata_text)
            if isinstance(loaded, dict):
                metadata = loaded

    entry = {
        LDB_COLS[0]: dir_name,
        LDB_COLS[1]: metadata.get("modelling_framework", "Unknown"),
        LDB_COLS[2]: metadata.get("base_llm", "Unknown"),
        LDB_COLS[3]: '*Calculating...*',
        LDB_COLS[4]: '*Calculating...*',
        LDB_COLS[5]: '*Calculating...*',
    }

    # No summary yet: keep the placeholders.
    summary_path = f"{DS_RESULTS_PATH}/{dir_name}/summary.txt"
    if summary_path not in repo_file_set:
        return entry

    summary_text = _read_downloaded_text(summary_path, dir_name)
    if summary_text is None:
        print(f"Warning: Summary file {summary_path} does not exist or is empty.")
    else:
        entry.update(_parse_summary_metrics(summary_text))
    return entry


def load_leaderboard_data():
    """Load leaderboard data from the Hugging Face dataset repo.

    Every directory under ``DS_SUBMISSIONS_PATH`` that contains a
    ``submission.jsonl`` yields one row. The framework and base LLM come from
    the submission's ``metadata.json`` ("Unknown" when absent). The three
    metric columns are read from ``DS_RESULTS_PATH/<dir>/summary.txt`` and stay
    at the ``'*Calculating...*'`` placeholder while no summary exists.

    Returns:
        A ``pandas.DataFrame`` sorted by ``LDB_COLS[4]`` in descending order.
        It is an empty frame with ``LDB_COLS`` as columns when the Hub client
        is unavailable or nothing could be loaded. Errors are printed, never
        raised; a failure on one submission skips only that submission.
    """
    if not HF_API:
        return pd.DataFrame(columns=LDB_COLS)

    leaderboard_entries = []
    processed_result_dirs = set()
    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        # Set copy for O(1) membership checks; the list keeps the repo order.
        repo_file_set = set(repo_files)

        submissions = [
            f for f in repo_files
            if f.endswith("submission.jsonl") and f.startswith(DS_SUBMISSIONS_PATH + "/")
        ]
        metadata_files = [
            f for f in repo_files
            if f.endswith("metadata.json") and f.startswith(DS_SUBMISSIONS_PATH + "/")
        ]

        for file_path in submissions:
            dir_name = Path(file_path).parent.name
            if dir_name in processed_result_dirs:
                continue
            processed_result_dirs.add(dir_name)
            try:
                leaderboard_entries.append(
                    _build_leaderboard_entry(dir_name, metadata_files, repo_file_set)
                )
            except Exception as e:
                # Isolate failures so one broken submission does not hide the rest.
                print(f"Error loading submission {dir_name}: {e}")
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")

    if not leaderboard_entries:
        return pd.DataFrame(columns=LDB_COLS)

    df = pd.DataFrame(leaderboard_entries)
    # Sort by "Final Solution Accuracy" descending.
    # NOTE: errors='coerce' turns the '*Calculating...*' placeholder into NaN
    # in this column, so pending submissions sort last.
    df[LDB_COLS[4]] = pd.to_numeric(df[LDB_COLS[4]], errors='coerce')
    df = df.sort_values(by=LDB_COLS[4], ascending=False)
    return df
def upload_submission(uploaded_file, dir_name, report_file, model_framework, base_llm):
    """Push a submission, its optional report, and a metadata file to the dataset repo.

    Files are written under ``DS_SUBMISSIONS_PATH/<dir_name>/`` as
    ``submission.jsonl``, ``report.pdf`` (only when ``report_file`` is truthy)
    and ``metadata.json``, each in its own commit and in that order.

    Returns:
        ``(True, <submission path in repo>)`` on success, otherwise
        ``(False, <error message>)``. Exceptions are never propagated.
    """
    if not HF_API:
        return False, "Hugging Face API not initialized"

    def _push(payload, path_in_repo, message):
        # Every upload targets the same dataset repo; only these three vary.
        HF_API.upload_file(
            path_or_fileobj=payload,
            path_in_repo=path_in_repo,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=message,
        )

    try:
        submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"

        _push(
            uploaded_file,
            f"{submission_path}/submission.jsonl",
            f"Upload submission: {dir_name}",
        )

        if report_file:
            _push(
                report_file,
                f"{submission_path}/report.pdf",
                f"Upload report for submission: {dir_name}",
            )

        # Metadata is serialized in memory and uploaded straight from a buffer.
        metadata = {
            "submission_name": dir_name,
            "modelling_framework": model_framework,
            "base_llm": base_llm,
        }
        metadata_buffer = io.BytesIO(json.dumps(metadata, indent=4).encode('utf-8'))
        _push(
            metadata_buffer,
            f"{submission_path}/metadata.json",
            f"Upload metadata for submission: {dir_name}",
        )
    except Exception as upload_error:
        return False, f"Upload error: {upload_error}"

    return True, submission_path
def check_name_exists(submission_name):
    """Return True if a submission directory with this exact name already exists.

    A name counts as taken when at least one file in the dataset repo lives
    under ``DS_SUBMISSIONS_PATH/<submission_name>/``.

    Returns False when the Hub client is unavailable or the file listing
    fails (the error is printed), so callers cannot tell "free" from
    "unknown" through this function alone.
    """
    if not HF_API:
        return False

    # The trailing slash matters: without it "foo" would be reported as taken
    # whenever a different submission such as "foobar" exists.
    submission_prefix = f"{DS_SUBMISSIONS_PATH}/{submission_name}/"

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
    except Exception as e:
        print(f"Error checking name existence: {e}")
        return False

    return any(file_path.startswith(submission_prefix) for file_path in repo_files)