"""Utilities for interacting with the Hugging Face Hub."""
import io
import json
import os
from pathlib import Path

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS
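
# Expected layout of the dataset repo (inferred from the functions below):
#   {DS_SUBMISSIONS_PATH}/<submission_name>/submission.jsonl  - the submission itself
#   {DS_SUBMISSIONS_PATH}/<submission_name>/metadata.json     - framework / base LLM info
#   {DS_SUBMISSIONS_PATH}/<submission_name>/report.pdf        - optional report
#   {DS_RESULTS_PATH}/<submission_name>/summary.txt           - evaluation metrics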

# Initialize HfApi
try:
    HF_API = HfApi()
    print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
except Exception as e:
    print(f"Failed to initialize HfApi: {e}")
    HF_API = None


def load_leaderboard_data():
    """Load leaderboard data from Hugging Face Dataset."""
    if not HF_API:
        return pd.DataFrame(columns=LDB_COLS)

    leaderboard_entries = []
    processed_result_dirs = set()
    
    try:
        # List all files in the dataset repo
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")

        # Collect submission and metadata files under the submissions path
        submissions = [
            f for f in repo_files
            if f.endswith("submission.jsonl") and f.startswith(DS_SUBMISSIONS_PATH + "/")
        ]

        metadata_files = [
            f for f in repo_files
            if f.endswith("metadata.json") and f.startswith(DS_SUBMISSIONS_PATH + "/")
        ]

        for file_path in submissions:
            dir_name = Path(file_path).parent.name
            if dir_name in processed_result_dirs:
                continue

            # Download the metadata file for this submission; default to an empty
            # dict so a missing metadata.json does not raise a NameError below
            metadata = {}
            metadata_file = next((f for f in metadata_files if f.startswith(f"{DS_SUBMISSIONS_PATH}/{dir_name}/")), None)
            if metadata_file:
                local_metadata_path = hf_hub_download(
                    repo_id=DATASET_REPO_ID,
                    filename=metadata_file,
                    repo_type="dataset",
                    local_dir=os.path.join("local_hf_downloads", dir_name),
                )
                with open(local_metadata_path, "r", encoding="utf-8") as f:
                    metadata = json.load(f)
                os.remove(local_metadata_path)

            processed_result_dirs.add(dir_name)
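            # LDB_COLS order (inferred from usage in this module): name, framework,
            # base LLM, coverage %, solution accuracy %, error %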
            entry = {LDB_COLS[0]: dir_name,
                     LDB_COLS[1]: metadata.get("modelling_framework", "Unknown"),
                     LDB_COLS[2]: metadata.get("base_llm", "Unknown"),
                     LDB_COLS[3]: '*Calculating...*',
                     LDB_COLS[4]: '*Calculating...*',
                     LDB_COLS[5]: '*Calculating...*'}

            # check if summary file exists, otherwise skip
            if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files:
                leaderboard_entries.append(entry)
                continue

            # Download summary file
            local_summary_path = hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=f"{DS_RESULTS_PATH}/{dir_name}/summary.txt",
                repo_type="dataset",
                local_dir=os.path.join("local_hf_downloads", dir_name),
            )

            if Path(local_summary_path).exists():
                with open(local_summary_path, "r", encoding="utf-8") as f:
                    # Each metric appears in summary.txt as a "<label>: <value>%" line
                    for line in f:
                        if 'Error perc' in line:
                            entry[LDB_COLS[5]] = float(line.split(":")[1].strip().replace("%", ""))
                        if 'Final Solution Accuracy' in line:
                            entry[LDB_COLS[4]] = float(line.split(":")[1].strip().replace("%", ""))
                        if 'Submission coverage perc' in line:
                            entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
                os.remove(local_summary_path)
            else:
                print(f"Warning: Summary file {local_summary_path} does not exist.")
                
            leaderboard_entries.append(entry)
            
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")

    if not leaderboard_entries:
        return pd.DataFrame(columns=LDB_COLS)

    df = pd.DataFrame(leaderboard_entries)

    # Sort by "Final Solution Accuracy" descending; entries still showing
    # '*Calculating...*' coerce to NaN and are placed at the bottom
    df[LDB_COLS[4]] = pd.to_numeric(df[LDB_COLS[4]], errors='coerce')
    df = df.sort_values(by=LDB_COLS[4], ascending=False)

    return df


def upload_submission(uploaded_file, dir_name, report_file, model_framework, base_llm):
    """Upload submission to Hugging Face Dataset."""
    if not HF_API:
        return False, "Hugging Face API not initialized"

    try:
        submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"
        HF_API.upload_file(
            path_or_fileobj=uploaded_file,
            path_in_repo=f"{submission_path}/submission.jsonl",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload submission: {dir_name}"
        )
        if report_file:
            HF_API.upload_file(
                path_or_fileobj=report_file,
                path_in_repo=f"{submission_path}/report.pdf",
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                commit_message=f"Upload report for submission: {dir_name}"
            )

        # create a file for metadata
        metadata = {
            "submission_name": dir_name,
            "modelling_framework": model_framework,
            "base_llm": base_llm,
        }
        HF_API.upload_file(
            path_or_fileobj=io.BytesIO(json.dumps(metadata, indent=4).encode('utf-8')),
            path_in_repo=f"{submission_path}/metadata.json",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Upload metadata for submission: {dir_name}"
        )
        
        return True, submission_path
    except Exception as e:
        return False, f"Upload error: {str(e)}"


def check_name_exists(submission_name):
    """Return True if a submission with the given name already exists in the dataset."""
    if not HF_API:
        return False

    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        for file_path in repo_files:
            if file_path.startswith(f"{DS_SUBMISSIONS_PATH}/{submission_name}"):
                return True
    except Exception as e:
        print(f"Error checking name existence: {e}")

    return False
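

# Minimal usage sketch (illustrative only): assumes valid Hugging Face credentials
# and that DATASET_REPO_ID points at an existing dataset repo. "my-submission",
# "submission.jsonl", "Pyomo" and "gpt-4o" are placeholder values.
if __name__ == "__main__":
    df = load_leaderboard_data()
    print(df.head())

    if not check_name_exists("my-submission"):
        with open("submission.jsonl", "rb") as fh:
            ok, msg = upload_submission(
                uploaded_file=fh,
                dir_name="my-submission",
                report_file=None,
                model_framework="Pyomo",
                base_llm="gpt-4o",
            )
        print(ok, msg)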