|
import os |
|
import json |
|
import logging |
|
from typing import List, Dict, Any |
|
from pathlib import Path |
|
from huggingface_hub import snapshot_download |
|
from fastapi import HTTPException |
|
|
|
from app.config import ( |
|
QUEUE_REPO, |
|
RESULTS_REPO, |
|
EVAL_REQUESTS_PATH, |
|
EVAL_RESULTS_PATH, |
|
HF_TOKEN |
|
) |
|
from app.core.cache import cache_config |
|
|
|
logger = logging.getLogger(__name__)

try:
    # Prefer the original leaderboard processing modules when importable.
    from app.leaderboard.read_evals import get_raw_eval_results
    from app.populate import get_leaderboard_df
    from app.display.utils import COLS, BENCHMARK_COLS, Tasks
except ImportError as e:
    logger.warning(f"Could not import original modules: {e}")

    # Fallback definitions so the service can still serve mock data.
    COLS = [
        "Model",
        "Average ⬆️",
        "Type",
        "Precision",
        "Architecture",
        "Hub License",
        "Hub ❤️",
        "#Params (B)",
        "Available on the hub",
        "Model sha",
    ]
    BENCHMARK_COLS = [
        "WinoGrande-IS (3-shot)",
        "GED",
        "Inflection (1-shot)",
        "Belebele (IS)",
        "ARC-Challenge-IS",
        "WikiQA-IS",
    ]

    class MockTask:
        """Lightweight stand-in pairing a task's internal key with its display column."""

        def __init__(self, name, col_name):
            self.name, self.col_name = name, col_name

    class Tasks:
        """Mirror of the original task registry (attribute names must match it)."""

        task0 = MockTask("winogrande_is", "WinoGrande-IS (3-shot)")
        task1 = MockTask("ged", "GED")
        task2 = MockTask("inflection", "Inflection (1-shot)")
        task5 = MockTask("belebele_is", "Belebele (IS)")
        task6 = MockTask("arc_challenge_is", "ARC-Challenge-IS")
        task7 = MockTask("wiki_qa_is", "WikiQA-IS")
|
|
|
class IcelandicLeaderboardService:
    """Serve the Icelandic LLM leaderboard.

    Mirrors the evaluation request/result datasets from the Hugging Face
    Hub into local directories, builds the leaderboard with the original
    processing pipeline (falling back to mock rows when those modules are
    absent), and reshapes every entry into the structure the React
    frontend expects.
    """

    # Display-column -> frontend task-key mapping for the Icelandic benchmarks.
    _TASK_KEYS = {
        "WinoGrande-IS (3-shot)": "winogrande_is",
        "GED": "ged",
        "Inflection (1-shot)": "inflection",
        "Belebele (IS)": "belebele_is",
        "ARC-Challenge-IS": "arc_challenge_is",
        "WikiQA-IS": "wiki_qa_is",
    }

    def __init__(self):
        # Local mirror directories for the HF dataset snapshots.
        self.results_path = EVAL_RESULTS_PATH
        self.requests_path = EVAL_REQUESTS_PATH

    def _download_if_missing(self, repo_id: str, local_dir: str, label: str) -> None:
        """Snapshot-download ``repo_id`` into ``local_dir`` unless it already holds files.

        NOTE(review): snapshot_download is a blocking call invoked from async
        methods; a slow download will stall the event loop — consider
        ``run_in_executor`` if that becomes a problem.
        """
        if not os.path.exists(local_dir) or not os.listdir(local_dir):
            logger.info(f"Downloading {label} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                token=HF_TOKEN,
                tqdm_class=None,
                etag_timeout=30,
            )

    async def _ensure_data_available(self):
        """Ensure evaluation data is available locally.

        Raises:
            HTTPException: 500 when either snapshot download fails.
        """
        try:
            # Results and requests live in two separate HF dataset repos.
            self._download_if_missing(RESULTS_REPO, self.results_path, "results")
            self._download_if_missing(QUEUE_REPO, self.requests_path, "requests")
        except Exception as e:
            logger.error(f"Failed to download data: {e}")
            raise HTTPException(status_code=500, detail=f"Failed to download data: {str(e)}") from e

    async def fetch_raw_data(self) -> List[Dict[str, Any]]:
        """Fetch raw leaderboard data using original Icelandic processing logic.

        Returns:
            One dict per leaderboard row (the processed dataframe as records),
            or mock rows when the original processing modules are unavailable.

        Raises:
            HTTPException: 500 on download or processing failure.
        """
        try:
            await self._ensure_data_available()
            logger.info("Processing Icelandic leaderboard data")
            try:
                raw_data, df = get_leaderboard_df(
                    self.results_path,
                    self.requests_path,
                    COLS,
                    BENCHMARK_COLS,
                )
                data = df.to_dict('records')
                logger.info(f"Processed {len(data)} Icelandic leaderboard entries")
                return data
            except NameError:
                # get_leaderboard_df never got imported (module-top fallback path).
                logger.warning("Using mock data - original processing modules not available")
                return self._generate_mock_data()
        except HTTPException:
            # Bug fix: let the HTTPException from _ensure_data_available
            # propagate untouched instead of re-wrapping it into a second
            # 500 with a mangled detail string.
            raise
        except Exception as e:
            logger.error(f"Failed to fetch Icelandic leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e)) from e

    def _generate_mock_data(self) -> List[Dict[str, Any]]:
        """Generate mock data for testing when original modules aren't available."""
        return [
            {
                "Model": "test-model/icelandic-gpt-7b",
                "Average ⬆️": 85.5,
                "Type": "fine-tuned",
                "T": "🔶",
                "Precision": "bfloat16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "apache-2.0",
                "Hub ❤️": 42,
                "#Params (B)": 7.0,
                "Available on the hub": True,
                "Model sha": "abc123def456",
                "WinoGrande-IS (3-shot)": 78.5,
                "GED": 92.3,
                "Inflection (1-shot)": 85.1,
                "Belebele (IS)": 80.7,
                "ARC-Challenge-IS": 76.2,
                "WikiQA-IS": 89.4,
                "Reasoning": False,
                "Note": ""
            },
            {
                "Model": "test-model/icelandic-llama-13b",
                "Average ⬆️": 88.2,
                "Type": "instruction-tuned",
                "T": "⭕",
                "Precision": "float16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "mit",
                "Hub ❤️": 156,
                "#Params (B)": 13.0,
                "Available on the hub": True,
                "Model sha": "def456abc789",
                "WinoGrande-IS (3-shot)": 82.1,
                "GED": 94.8,
                "Inflection (1-shot)": 87.9,
                "Belebele (IS)": 85.3,
                "ARC-Challenge-IS": 79.8,
                "WikiQA-IS": 91.2,
                "Reasoning": True,
                "Note": "reasoning model with 32k thinking budget"
            }
        ]

    async def get_formatted_data(self) -> List[Dict[str, Any]]:
        """Get formatted leaderboard data compatible with React frontend.

        Entries that fail to transform are logged and skipped so one bad
        row cannot take down the whole response.

        Raises:
            HTTPException: 500 when fetching or formatting fails wholesale.
        """
        try:
            raw_data = await self.fetch_raw_data()
            formatted_data = []
            for item in raw_data:
                try:
                    formatted_data.append(await self.transform_data(item))
                except Exception as e:
                    logger.error(f"Failed to format entry: {e}")
                    continue
            logger.info(f"Formatted {len(formatted_data)} entries for frontend")
            return formatted_data
        except HTTPException:
            # Bug fix: propagate HTTPExceptions from fetch_raw_data unchanged
            # instead of double-wrapping them into a generic 500.
            raise
        except Exception as e:
            logger.error(f"Failed to format leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e)) from e

    @staticmethod
    def _clean_model_name(raw_model_name: str) -> str:
        """Strip the hub-link HTML the pipeline wraps model names in, if present."""
        if '<a target="_blank" href=' in raw_model_name:
            import re
            match = re.search(r'>([^<]+)</a>', raw_model_name)
            return match.group(1) if match else raw_model_name
        return raw_model_name

    async def transform_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform Icelandic leaderboard data into format expected by React frontend.

        Args:
            data: One raw leaderboard row keyed by display-column names.

        Returns:
            Nested dict with ``id``, ``model``, ``evaluations``, ``features``
            and ``metadata`` sections.
        """
        model_name = self._clean_model_name(data.get("Model", "Unknown"))
        precision = data.get("Precision", "Unknown")
        revision = data.get("Model sha", "Unknown")
        # A model can be submitted under several precisions/revisions, so the
        # frontend id combines all three.
        unique_id = f"{model_name}_{precision}_{revision}"

        evaluations = {
            task_key: {
                "name": display_name,
                "value": data[display_name],
                # No separate normalization step exists; the raw score is reused.
                "normalized_score": data[display_name],
            }
            for display_name, task_key in self._TASK_KEYS.items()
            if display_name in data
        }

        features = {
            "is_not_available_on_hub": not data.get("Available on the hub", True),
            "is_merged": False,
            "is_moe": False,
            "is_flagged": False,
            "is_official_provider": False,
        }

        metadata = {
            "upload_date": None,
            "submission_date": None,
            "generation": None,
            "base_model": None,
            "hub_license": data.get("Hub License", ""),
            "hub_hearts": data.get("Hub ❤️", 0),
            "params_billions": data.get("#Params (B)", 0),
            "co2_cost": 0,
        }

        return {
            "id": unique_id,
            "model": {
                "name": model_name,
                "sha": revision,
                "precision": precision,
                # "Type" is passed through as-is (the old identity type_mapping
                # and the unused "T" symbol lookup were dead code).
                "type": data.get("Type", "Unknown"),
                "weight_type": None,
                "architecture": data.get("Architecture", "Unknown"),
                "average_score": data.get("Average ⬆️", 0),
                "has_chat_template": False,
                "reasoning": data.get("Reasoning", False),
                "note": data.get("Note", ""),
            },
            "evaluations": evaluations,
            "features": features,
            "metadata": metadata,
        }