File size: 10,910 Bytes
1d31670 d4577f4 1d31670 bcaca18 1d31670 bcaca18 1d31670 bcaca18 1d31670 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 |
import os
import json
import logging
from typing import List, Dict, Any
from pathlib import Path
from huggingface_hub import snapshot_download
from fastapi import HTTPException
from app.config import (
QUEUE_REPO,
RESULTS_REPO,
EVAL_REQUESTS_PATH,
EVAL_RESULTS_PATH,
HF_TOKEN
)
from app.core.cache import cache_config
logger = logging.getLogger(__name__)

# The original leaderboard modules are optional. When any of them cannot be
# imported, minimal stand-ins for the column lists and the task registry are
# defined instead so the rest of this module can still be loaded.
try:
    from app.leaderboard.read_evals import get_raw_eval_results
    from app.populate import get_leaderboard_df
    from app.display.utils import COLS, BENCHMARK_COLS, Tasks
except ImportError as import_error:
    # Fallback for development without mounted volume
    logger.warning(f"Could not import original modules: {import_error}")

    # Leaderboard metadata columns used when the real definitions are missing.
    COLS = [
        "Model",
        "Average ⬆️",
        "Type",
        "Precision",
        "Architecture",
        "Hub License",
        "Hub ❤️",
        "#Params (B)",
        "Available on the hub",
        "Model sha",
    ]

    # Display names of the benchmark score columns.
    BENCHMARK_COLS = [
        "WinoGrande-IS (3-shot)",
        "GED",
        "Inflection (1-shot)",
        "Belebele (IS)",
        "ARC-Challenge-IS",
        "WikiQA-IS",
    ]

    class MockTask:
        """Plain holder pairing a task key with its display column name."""

        def __init__(self, name, col_name):
            self.col_name = col_name
            self.name = name

    class Tasks:
        """Fallback task registry: one ``MockTask`` attribute per benchmark."""

        task0 = MockTask("winogrande_is", "WinoGrande-IS (3-shot)")
        task1 = MockTask("ged", "GED")
        task2 = MockTask("inflection", "Inflection (1-shot)")
        task5 = MockTask("belebele_is", "Belebele (IS)")
        task6 = MockTask("arc_challenge_is", "ARC-Challenge-IS")
        task7 = MockTask("wiki_qa_is", "WikiQA-IS")
class IcelandicLeaderboardService:
    """Load Icelandic leaderboard data and reshape it for the React frontend.

    The service makes sure the evaluation results and request queue are
    present on local disk, runs the original leaderboard processing when it
    could be imported (falling back to mock rows otherwise), and converts
    each row into the nested dictionary layout the frontend consumes.
    """

    # Benchmark display column -> evaluation key used in the frontend payload.
    _TASK_KEYS = {
        "WinoGrande-IS (3-shot)": "winogrande_is",
        "GED": "ged",
        "Inflection (1-shot)": "inflection",
        "Belebele (IS)": "belebele_is",
        "ARC-Challenge-IS": "arc_challenge_is",
        "WikiQA-IS": "wiki_qa_is",
    }

    def __init__(self):
        self.results_path = EVAL_RESULTS_PATH
        self.requests_path = EVAL_REQUESTS_PATH

    def _download_if_missing(self, repo_id, local_dir, label):
        """Download dataset ``repo_id`` into ``local_dir`` if it is absent or empty.

        ``label`` is only used in the log message ("results" / "requests").
        """
        if os.path.exists(local_dir) and os.listdir(local_dir):
            return
        logger.info(f"Downloading {label} to {local_dir}")
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            token=HF_TOKEN,
            tqdm_class=None,
            etag_timeout=30,
        )

    async def _ensure_data_available(self):
        """Ensure evaluation data is available locally.

        Raises:
            HTTPException: status 500 when checking or downloading either
                dataset fails; the original error is chained as the cause.
        """
        # NOTE(review): snapshot_download is invoked without awaiting, so a
        # download presumably holds up the event loop until it finishes.
        # Moving it to a worker thread would need a lock so that concurrent
        # requests do not download into the same directory at once.
        try:
            self._download_if_missing(RESULTS_REPO, self.results_path, "results")
            self._download_if_missing(QUEUE_REPO, self.requests_path, "requests")
        except Exception as exc:
            logger.error(f"Failed to download data: {exc}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to download data: {str(exc)}",
            ) from exc

    async def fetch_raw_data(self) -> List[Dict[str, Any]]:
        """Fetch raw leaderboard rows using the original Icelandic processing logic.

        Returns:
            One dictionary per leaderboard row (``DataFrame.to_dict('records')``),
            or the mock rows when the original processing function was never
            imported.

        Raises:
            HTTPException: status 500 on any failure. An ``HTTPException``
                raised while ensuring the data is available is passed through
                unchanged instead of being wrapped a second time.
        """
        try:
            await self._ensure_data_available()
            logger.info("Processing Icelandic leaderboard data")

            # Only the *lookup* of the optional function is guarded: a
            # NameError raised inside real processing is a genuine bug and
            # must not be papered over with mock data.
            try:
                build_leaderboard_df = get_leaderboard_df
            except NameError:
                # Fallback: return mock data for testing
                logger.warning("Using mock data - original processing modules not available")
                return self._generate_mock_data()

            _, df = build_leaderboard_df(
                self.results_path,
                self.requests_path,
                COLS,
                BENCHMARK_COLS,
            )
            # Convert DataFrame to list of dictionaries
            data = df.to_dict('records')
            logger.info(f"Processed {len(data)} Icelandic leaderboard entries")
            return data
        except HTTPException:
            # Already a well-formed API error; keep its status and detail.
            raise
        except Exception as exc:
            logger.error(f"Failed to fetch Icelandic leaderboard data: {exc}")
            raise HTTPException(status_code=500, detail=str(exc)) from exc

    def _generate_mock_data(self) -> List[Dict[str, Any]]:
        """Generate mock rows for testing when the original modules aren't available."""
        return [
            {
                "Model": "test-model/icelandic-gpt-7b",
                "Average ⬆️": 85.5,
                "Type": "fine-tuned",
                "T": "🔶",
                "Precision": "bfloat16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "apache-2.0",
                "Hub ❤️": 42,
                "#Params (B)": 7.0,
                "Available on the hub": True,
                "Model sha": "abc123def456",
                "WinoGrande-IS (3-shot)": 78.5,
                "GED": 92.3,
                "Inflection (1-shot)": 85.1,
                "Belebele (IS)": 80.7,
                "ARC-Challenge-IS": 76.2,
                "WikiQA-IS": 89.4,
                "Reasoning": False,
                "Note": "",
            },
            {
                "Model": "test-model/icelandic-llama-13b",
                "Average ⬆️": 88.2,
                "Type": "instruction-tuned",
                "T": "⭕",
                "Precision": "float16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "mit",
                "Hub ❤️": 156,
                "#Params (B)": 13.0,
                "Available on the hub": True,
                "Model sha": "def456abc789",
                "WinoGrande-IS (3-shot)": 82.1,
                "GED": 94.8,
                "Inflection (1-shot)": 87.9,
                "Belebele (IS)": 85.3,
                "ARC-Challenge-IS": 79.8,
                "WikiQA-IS": 91.2,
                "Reasoning": True,
                "Note": "reasoning model with 32k thinking budget",
            },
        ]

    async def get_formatted_data(self) -> List[Dict[str, Any]]:
        """Get formatted leaderboard data compatible with the React frontend.

        Rows that fail to transform are logged and skipped so one bad entry
        does not take down the whole response.

        Raises:
            HTTPException: status 500 on failure; an ``HTTPException`` coming
                from ``fetch_raw_data`` is passed through unchanged.
        """
        try:
            raw_data = await self.fetch_raw_data()
            formatted_data = []
            for item in raw_data:
                try:
                    formatted_data.append(await self.transform_data(item))
                except Exception as exc:
                    logger.error(f"Failed to format entry: {exc}")
            logger.info(f"Formatted {len(formatted_data)} entries for frontend")
            return formatted_data
        except HTTPException:
            raise
        except Exception as exc:
            logger.error(f"Failed to format leaderboard data: {exc}")
            raise HTTPException(status_code=500, detail=str(exc)) from exc

    @staticmethod
    def _clean_model_name(raw_name: Any) -> str:
        """Return a plain-text model name from a raw ``Model`` cell.

        ``None`` becomes ``"Unknown"`` and other non-string values are
        stringified, so the substring check below cannot raise ``TypeError``.
        When the cell is an HTML link, the text of the first anchor is used.
        """
        import re

        if raw_name is None:
            return "Unknown"
        if not isinstance(raw_name, str):
            return str(raw_name)
        if '<a target="_blank" href=' in raw_name:
            anchor_text = re.search(r'>([^<]+)</a>', raw_name)
            if anchor_text:
                return anchor_text.group(1)
        return raw_name

    async def transform_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform one Icelandic leaderboard row into the frontend format.

        Args:
            data: A row keyed by leaderboard display column names
                (``"Model"``, ``"Precision"``, ``"GED"``, ...).

        Returns:
            A dictionary with the keys ``id``, ``model``, ``evaluations``,
            ``features`` and ``metadata``. Only benchmarks present in ``data``
            appear under ``evaluations``.
        """
        # Create unique ID and clean model name
        model_name = self._clean_model_name(data.get("Model", "Unknown"))
        precision = data.get("Precision", "Unknown")
        revision = data.get("Model sha", "Unknown")
        unique_id = f"{model_name}_{precision}_{revision}"

        # Map Icelandic tasks to evaluations format. The raw score is reported
        # as both "value" and "normalized_score".
        evaluations = {
            task_key: {
                "name": column,
                "value": data[column],
                "normalized_score": data[column],
            }
            for column, task_key in self._TASK_KEYS.items()
            if column in data
        }

        # The Icelandic type names are passed to the frontend unchanged.
        model_type = data.get("Type", "Unknown")

        features = {
            "is_not_available_on_hub": not data.get("Available on the hub", True),
            "is_merged": False,  # Not tracked in Icelandic leaderboard
            "is_moe": False,  # Not tracked in Icelandic leaderboard
            "is_flagged": False,  # Not tracked in Icelandic leaderboard
            "is_official_provider": False,  # Not tracked in Icelandic leaderboard
        }

        metadata = {
            "upload_date": None,  # Not available in Icelandic data
            "submission_date": None,  # Not available in Icelandic data
            "generation": None,  # Not available in Icelandic data
            "base_model": None,  # Not available in Icelandic data
            "hub_license": data.get("Hub License", ""),
            "hub_hearts": data.get("Hub ❤️", 0),
            "params_billions": data.get("#Params (B)", 0),
            "co2_cost": 0,  # Not tracked in Icelandic leaderboard
        }

        return {
            "id": unique_id,
            "model": {
                "name": model_name,
                "sha": revision,
                "precision": precision,
                "type": model_type,
                "weight_type": None,  # Not available in Icelandic data
                "architecture": data.get("Architecture", "Unknown"),
                "average_score": data.get("Average ⬆️", 0),
                "has_chat_template": False,  # Not tracked in Icelandic leaderboard
                "reasoning": data.get("Reasoning", False),  # Reasoning enabled flag
                "note": data.get("Note", ""),  # Extra model information
            },
            "evaluations": evaluations,
            "features": features,
            "metadata": metadata,
        }