|
import os |
|
import json |
|
import logging |
|
from typing import List, Dict, Any |
|
from pathlib import Path |
|
from huggingface_hub import snapshot_download |
|
from fastapi import HTTPException |
|
|
|
from app.config import ( |
|
QUEUE_REPO, |
|
RESULTS_REPO, |
|
EVAL_REQUESTS_PATH, |
|
EVAL_RESULTS_PATH, |
|
HF_TOKEN |
|
) |
|
from app.core.cache import cache_config |
|
|
|
logger = logging.getLogger(__name__)

try:
    # Prefer the original leaderboard processing modules when importable.
    from app.leaderboard.read_evals import get_raw_eval_results
    from app.populate import get_leaderboard_df
    from app.display.utils import COLS, BENCHMARK_COLS, Tasks
except ImportError as e:
    logger.warning(f"Could not import original modules: {e}")

    # Fallback definitions so the service can still serve mock data.
    COLS = [
        "Model",
        "Average ⬆️",
        "Type",
        "Precision",
        "Architecture",
        "Hub License",
        "Hub ❤️",
        "#Params (B)",
        "Available on the hub",
        "Model sha",
    ]
    BENCHMARK_COLS = [
        "WinoGrande-IS (3-shot)",
        "GED",
        "Inflection (1-shot)",
        "Belebele (IS)",
        "ARC-Challenge-IS",
        "WikiQA-IS",
    ]

    class MockTask:
        """Lightweight stand-in pairing a task's internal key with its display column."""

        def __init__(self, name, col_name):
            self.name, self.col_name = name, col_name

    class Tasks:
        """Mirror of the original task registry (attribute names must match it)."""

        task0 = MockTask("winogrande_is", "WinoGrande-IS (3-shot)")
        task1 = MockTask("ged", "GED")
        task2 = MockTask("inflection", "Inflection (1-shot)")
        task5 = MockTask("belebele_is", "Belebele (IS)")
        task6 = MockTask("arc_challenge_is", "ARC-Challenge-IS")
        task7 = MockTask("wiki_qa_is", "WikiQA-IS")
|
|
|
class IcelandicLeaderboardService:
    """Serve the Icelandic LLM leaderboard.

    Mirrors the evaluation request/result datasets from the Hugging Face
    Hub into local directories, builds the leaderboard with the original
    processing pipeline (falling back to mock rows when those modules are
    absent), and reshapes every entry into the structure the React
    frontend expects.
    """

    # Display-column -> frontend task-key mapping for the Icelandic benchmarks.
    _TASK_KEYS = {
        "WinoGrande-IS (3-shot)": "winogrande_is",
        "GED": "ged",
        "Inflection (1-shot)": "inflection",
        "Belebele (IS)": "belebele_is",
        "ARC-Challenge-IS": "arc_challenge_is",
        "WikiQA-IS": "wiki_qa_is",
    }

    def __init__(self):
        # Local mirror directories for the HF dataset snapshots.
        self.results_path = EVAL_RESULTS_PATH
        self.requests_path = EVAL_REQUESTS_PATH

    def _download_if_missing(self, repo_id: str, local_dir: str, label: str) -> None:
        """Snapshot-download ``repo_id`` into ``local_dir`` unless it already holds files.

        NOTE(review): snapshot_download is a blocking call invoked from async
        methods; a slow download will stall the event loop — consider
        ``run_in_executor`` if that becomes a problem.
        """
        if not os.path.exists(local_dir) or not os.listdir(local_dir):
            logger.info(f"Downloading {label} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                token=HF_TOKEN,
                tqdm_class=None,
                etag_timeout=30,
            )

    async def _ensure_data_available(self):
        """Ensure evaluation data is available locally.

        Raises:
            HTTPException: 500 when either snapshot download fails.
        """
        try:
            # Results and requests live in two separate HF dataset repos.
            self._download_if_missing(RESULTS_REPO, self.results_path, "results")
            self._download_if_missing(QUEUE_REPO, self.requests_path, "requests")
        except Exception as e:
            logger.error(f"Failed to download data: {e}")
            raise HTTPException(status_code=500, detail=f"Failed to download data: {str(e)}") from e

    async def fetch_raw_data(self) -> List[Dict[str, Any]]:
        """Fetch raw leaderboard data using original Icelandic processing logic.

        Returns:
            One dict per leaderboard row (the processed dataframe as records),
            or mock rows when the original processing modules are unavailable.

        Raises:
            HTTPException: 500 on download or processing failure.
        """
        try:
            await self._ensure_data_available()
            logger.info("Processing Icelandic leaderboard data")
            try:
                raw_data, df = get_leaderboard_df(
                    self.results_path,
                    self.requests_path,
                    COLS,
                    BENCHMARK_COLS,
                )
                data = df.to_dict('records')
                logger.info(f"Processed {len(data)} Icelandic leaderboard entries")
                return data
            except NameError:
                # get_leaderboard_df never got imported (module-top fallback path).
                logger.warning("Using mock data - original processing modules not available")
                return self._generate_mock_data()
        except HTTPException:
            # Bug fix: let the HTTPException from _ensure_data_available
            # propagate untouched instead of re-wrapping it into a second
            # 500 with a mangled detail string.
            raise
        except Exception as e:
            logger.error(f"Failed to fetch Icelandic leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e)) from e

    def _generate_mock_data(self) -> List[Dict[str, Any]]:
        """Generate mock data for testing when original modules aren't available."""
        return [
            {
                "Model": "test-model/icelandic-gpt-7b",
                "Average ⬆️": 85.5,
                "Type": "fine-tuned",
                "T": "🔶",
                "Precision": "bfloat16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "apache-2.0",
                "Hub ❤️": 42,
                "#Params (B)": 7.0,
                "Available on the hub": True,
                "Model sha": "abc123def456",
                "WinoGrande-IS (3-shot)": 78.5,
                "GED": 92.3,
                "Inflection (1-shot)": 85.1,
                "Belebele (IS)": 80.7,
                "ARC-Challenge-IS": 76.2,
                "WikiQA-IS": 89.4,
                "Reasoning": False,
                "Note": ""
            },
            {
                "Model": "test-model/icelandic-llama-13b",
                "Average ⬆️": 88.2,
                "Type": "instruction-tuned",
                "T": "⭕",
                "Precision": "float16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "mit",
                "Hub ❤️": 156,
                "#Params (B)": 13.0,
                "Available on the hub": True,
                "Model sha": "def456abc789",
                "WinoGrande-IS (3-shot)": 82.1,
                "GED": 94.8,
                "Inflection (1-shot)": 87.9,
                "Belebele (IS)": 85.3,
                "ARC-Challenge-IS": 79.8,
                "WikiQA-IS": 91.2,
                "Reasoning": True,
                "Note": "reasoning model with 32k thinking budget"
            }
        ]

    async def get_formatted_data(self) -> List[Dict[str, Any]]:
        """Get formatted leaderboard data compatible with React frontend.

        Entries that fail to transform are logged and skipped so one bad
        row cannot take down the whole response.

        Raises:
            HTTPException: 500 when fetching or formatting fails wholesale.
        """
        try:
            raw_data = await self.fetch_raw_data()
            formatted_data = []
            for item in raw_data:
                try:
                    formatted_data.append(await self.transform_data(item))
                except Exception as e:
                    logger.error(f"Failed to format entry: {e}")
                    continue
            logger.info(f"Formatted {len(formatted_data)} entries for frontend")
            return formatted_data
        except HTTPException:
            # Bug fix: propagate HTTPExceptions from fetch_raw_data unchanged
            # instead of double-wrapping them into a generic 500.
            raise
        except Exception as e:
            logger.error(f"Failed to format leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e)) from e

    @staticmethod
    def _clean_model_name(raw_model_name: str) -> str:
        """Strip the hub-link HTML the pipeline wraps model names in, if present."""
        if '<a target="_blank" href=' in raw_model_name:
            import re
            match = re.search(r'>([^<]+)</a>', raw_model_name)
            return match.group(1) if match else raw_model_name
        return raw_model_name

    async def transform_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform Icelandic leaderboard data into format expected by React frontend.

        Args:
            data: One raw leaderboard row keyed by display-column names.

        Returns:
            Nested dict with ``id``, ``model``, ``evaluations``, ``features``
            and ``metadata`` sections.
        """
        model_name = self._clean_model_name(data.get("Model", "Unknown"))
        precision = data.get("Precision", "Unknown")
        revision = data.get("Model sha", "Unknown")
        # A model can be submitted under several precisions/revisions, so the
        # frontend id combines all three.
        unique_id = f"{model_name}_{precision}_{revision}"

        evaluations = {
            task_key: {
                "name": display_name,
                "value": data[display_name],
                # No separate normalization step exists; the raw score is reused.
                "normalized_score": data[display_name],
            }
            for display_name, task_key in self._TASK_KEYS.items()
            if display_name in data
        }

        features = {
            "is_not_available_on_hub": not data.get("Available on the hub", True),
            "is_merged": False,
            "is_moe": False,
            "is_flagged": False,
            "is_official_provider": False,
        }

        metadata = {
            "upload_date": None,
            "submission_date": None,
            "generation": None,
            "base_model": None,
            "hub_license": data.get("Hub License", ""),
            "hub_hearts": data.get("Hub ❤️", 0),
            "params_billions": data.get("#Params (B)", 0),
            "co2_cost": 0,
        }

        return {
            "id": unique_id,
            "model": {
                "name": model_name,
                "sha": revision,
                "precision": precision,
                # "Type" is passed through as-is (the old identity type_mapping
                # and the unused "T" symbol lookup were dead code).
                "type": data.get("Type", "Unknown"),
                "weight_type": None,
                "architecture": data.get("Architecture", "Unknown"),
                "average_score": data.get("Average ⬆️", 0),
                "has_chat_template": False,
                "reasoning": data.get("Reasoning", False),
                "note": data.get("Note", ""),
            },
            "evaluations": evaluations,
            "features": features,
            "metadata": metadata,
        }