File size: 4,573 Bytes
180f9fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21ed616
180f9fe
 
 
21ed616
180f9fe
 
 
21ed616
 
 
 
 
 
 
 
180f9fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Utilities for interacting with the Hugging Face Hub."""

import os
import shutil
from pathlib import Path
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
from huggingface_hub.utils import RepositoryNotFoundError, HFValidationError

from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS

# Initialize HfApi
try:
    # A single module-level client is shared by every function in this module.
    HF_API = HfApi()
    print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
except Exception as e:
    print(f"Failed to initialize HfApi: {e}")
    # Callers below treat a falsy HF_API as "Hub unavailable" and degrade
    # gracefully (empty DataFrame / False) instead of raising.
    HF_API = None


def load_leaderboard_data():
    """Load leaderboard data from the Hugging Face dataset repository.

    Scans the results path for ``summary.txt`` files, downloads each one,
    parses the reported metrics, and collects one row per result directory.

    Returns:
        pd.DataFrame: rows keyed by ``LDB_COLS``; empty (but with the
        expected columns) when the API is unavailable, listing fails, or
        no summaries exist.
    """
    if not HF_API:
        return pd.DataFrame(columns=LDB_COLS)

    leaderboard_entries = []
    processed_result_dirs = set()

    try:
        # List all files in the dataset once; reused for every entry.
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")

        summary_files = [
            f for f in repo_files
            if f.endswith("summary.txt") and f.startswith(DS_RESULTS_PATH + "/")
        ]
        # Reverse-sort so that if a directory somehow has several summary
        # files, the lexicographically latest path wins (first seen is kept).
        summary_files.sort(reverse=True)

        for file_path in summary_files:
            dir_name = Path(file_path).parent.name
            if dir_name in processed_result_dirs:
                continue
            processed_result_dirs.add(dir_name)

            # Isolate per-entry failures so one bad summary file does not
            # abort the remaining leaderboard rows (previously a single
            # download/parse error discarded every entry that followed).
            try:
                leaderboard_entries.append(_build_leaderboard_entry(file_path, dir_name))
            except Exception as e:
                print(f"Error processing result dir '{dir_name}': {e}")

    except Exception as e:
        print(f"Error loading leaderboard data: {e}")

    finally:
        # Clean up any downloaded summary files.
        if Path("temp_hf_downloads").exists():
            shutil.rmtree("temp_hf_downloads", ignore_errors=True)

    if not leaderboard_entries:
        return pd.DataFrame(columns=LDB_COLS)

    return pd.DataFrame(leaderboard_entries)


def _build_leaderboard_entry(file_path, dir_name):
    """Download one summary file and parse its metrics into a row dict.

    Raises whatever ``hf_hub_download`` raises; the caller isolates the
    failure to this entry.
    """
    entry = {
        LDB_COLS[0]: dir_name,
        LDB_COLS[1]: 'N/A',
        LDB_COLS[2]: 'N/A',
        LDB_COLS[3]: 'N/A',
        LDB_COLS[4]: 0,
    }

    local_summary_path = hf_hub_download(
        repo_id=DATASET_REPO_ID,
        filename=file_path,
        repo_type="dataset",
        local_dir=os.path.join("temp_hf_downloads", dir_name),
    )

    if Path(local_summary_path).exists():
        with open(local_summary_path, "r", encoding="utf-8") as f:
            for line in f:
                # partition keeps values that themselves contain ':' intact
                # (split(":")[1] truncated them).
                _, _, raw_value = line.partition(":")
                value = raw_value.strip().replace("%", "")
                try:
                    if 'Execution perc' in line:
                        entry[LDB_COLS[1]] = float(value)
                    elif 'Consistency perc' in line:
                        entry[LDB_COLS[2]] = float(value)
                    elif 'Final Solution Accuracy' in line:
                        entry[LDB_COLS[3]] = float(value)
                    elif 'Total Submitted Models Parsed' in line:
                        entry[LDB_COLS[4]] = int(value)
                except ValueError:
                    # Malformed numeric value: keep the 'N/A'/0 placeholder
                    # instead of crashing the whole entry.
                    pass
        os.remove(local_summary_path)

    return entry


def upload_submission(uploaded_file, dir_name):
    """Upload a submission file to the Hugging Face dataset repository.

    Args:
        uploaded_file: Path or file object accepted by ``HfApi.upload_file``.
        dir_name: Target directory name under the submissions path.

    Returns:
        tuple: ``(True, submission_path)`` on success,
        ``(False, error_message)`` on failure or when the API is unavailable.
    """
    if not HF_API:
        return False, "Hugging Face API not initialized"

    submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"

    try:
        HF_API.upload_file(
            path_or_fileobj=uploaded_file,
            path_in_repo=f"{submission_path}/submission.jsonl",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload submission: {dir_name}",
        )
    except Exception as e:
        return False, f"Upload error: {str(e)}"

    return True, submission_path


def check_name_exists(submission_name):
    """Return True if a submission directory named *submission_name* exists.

    Matches only the exact path ``{DS_SUBMISSIONS_PATH}/{submission_name}``
    (or files inside that directory). The previous bare prefix match wrongly
    reported 'foo' as taken whenever a submission like 'foobar' existed.

    Returns False on any listing error or when the API is unavailable.
    """
    if not HF_API:
        return False

    exact_path = f"{DS_SUBMISSIONS_PATH}/{submission_name}"
    dir_prefix = exact_path + "/"

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        return any(
            file_path == exact_path or file_path.startswith(dir_prefix)
            for file_path in repo_files
        )
    except Exception as e:
        print(f"Error checking name existence: {e}")

    return False