# Hugging Face Space: mideind/icelandic-llm-leaderboard (status: Running)
#
# File size: 10,910 Bytes

import os
import json
import logging
from typing import List, Dict, Any
from pathlib import Path
from huggingface_hub import snapshot_download
from fastapi import HTTPException

from app.config import (
    QUEUE_REPO,
    RESULTS_REPO,
    EVAL_REQUESTS_PATH,
    EVAL_RESULTS_PATH,
    HF_TOKEN
)
from app.core.cache import cache_config

logger = logging.getLogger(__name__)

try:
    from app.leaderboard.read_evals import get_raw_eval_results
    from app.populate import get_leaderboard_df
    from app.display.utils import COLS, BENCHMARK_COLS, Tasks
except ImportError as e:
    # The original leaderboard modules could not be imported (the existing
    # note here says this happens in development without the mounted volume).
    # Define minimal stand-ins so the rest of this module still loads.
    logger.warning(f"Could not import original modules: {e}")

    # Metadata columns of a leaderboard row.
    COLS = [
        "Model",
        "Average ⬆️",
        "Type",
        "Precision",
        "Architecture",
        "Hub License",
        "Hub ❤️",
        "#Params (B)",
        "Available on the hub",
        "Model sha",
    ]

    # One score column per benchmark.
    BENCHMARK_COLS = [
        "WinoGrande-IS (3-shot)",
        "GED",
        "Inflection (1-shot)",
        "Belebele (IS)",
        "ARC-Challenge-IS",
        "WikiQA-IS",
    ]

    class MockTask:
        """Stand-in task record holding a task key and its display column."""

        def __init__(self, name, col_name):
            self.name, self.col_name = name, col_name

    class Tasks:
        """Fallback task table; attribute names keep the original numbering."""

        task0 = MockTask("winogrande_is", "WinoGrande-IS (3-shot)")
        task1 = MockTask("ged", "GED")
        task2 = MockTask("inflection", "Inflection (1-shot)")
        task5 = MockTask("belebele_is", "Belebele (IS)")
        task6 = MockTask("arc_challenge_is", "ARC-Challenge-IS")
        task7 = MockTask("wiki_qa_is", "WikiQA-IS")

class IcelandicLeaderboardService:
    """Load Icelandic leaderboard rows and reshape them for the React frontend.

    The service makes sure the results/requests datasets exist locally,
    builds the leaderboard rows (or falls back to mock rows when the original
    processing modules could not be imported), and converts every row into the
    nested ``model`` / ``evaluations`` / ``features`` / ``metadata`` structure.
    """

    # Leaderboard score column -> key used in the frontend "evaluations" dict.
    _TASK_KEYS = {
        "WinoGrande-IS (3-shot)": "winogrande_is",
        "GED": "ged",
        "Inflection (1-shot)": "inflection",
        "Belebele (IS)": "belebele_is",
        "ARC-Challenge-IS": "arc_challenge_is",
        "WikiQA-IS": "wiki_qa_is",
    }

    def __init__(self):
        """Remember the local directories for evaluation results and requests."""
        self.results_path = EVAL_RESULTS_PATH
        self.requests_path = EVAL_REQUESTS_PATH

    @staticmethod
    def _is_missing_or_empty(path) -> bool:
        """Return True when *path* does not exist or is an empty directory."""
        return not os.path.exists(path) or not os.listdir(path)

    async def _ensure_data_available(self):
        """Ensure evaluation data is available locally.

        Each dataset is downloaded only when its local directory is missing
        or empty.

        Raises:
            HTTPException: status 500 when checking or downloading fails.
        """
        downloads = (
            ("results", RESULTS_REPO, self.results_path),
            ("requests", QUEUE_REPO, self.requests_path),
        )
        try:
            for label, repo_id, local_dir in downloads:
                if not self._is_missing_or_empty(local_dir):
                    continue
                logger.info(f"Downloading {label} to {local_dir}")
                # NOTE(review): this call is synchronous inside a coroutine,
                # so the event loop is presumably blocked while it runs.
                snapshot_download(
                    repo_id=repo_id,
                    local_dir=local_dir,
                    repo_type="dataset",
                    token=HF_TOKEN,
                    tqdm_class=None,
                    etag_timeout=30,
                )
        except Exception as e:
            logger.error(f"Failed to download data: {e}")
            raise HTTPException(
                status_code=500, detail=f"Failed to download data: {str(e)}"
            ) from e

    async def fetch_raw_data(self) -> List[Dict[str, Any]]:
        """Fetch raw leaderboard rows using the original processing logic.

        Returns:
            One dict per leaderboard row. When ``get_leaderboard_df`` was not
            importable, the rows from ``_generate_mock_data`` are returned.

        Raises:
            HTTPException: status 500 on any failure. An HTTPException raised
                by the download step is propagated unchanged.
        """
        try:
            await self._ensure_data_available()

            logger.info("Processing Icelandic leaderboard data")

            # Check for the optional import explicitly instead of catching
            # NameError, which would also hide genuine NameErrors raised
            # inside the processing code and silently serve mock data.
            build_leaderboard = globals().get("get_leaderboard_df")
            if build_leaderboard is None:
                logger.warning("Using mock data - original processing modules not available")
                return self._generate_mock_data()

            _, df = build_leaderboard(
                self.results_path,
                self.requests_path,
                COLS,
                BENCHMARK_COLS,
            )

            # Convert DataFrame to list of dictionaries
            data = df.to_dict('records')

            logger.info(f"Processed {len(data)} Icelandic leaderboard entries")
            return data

        except HTTPException:
            # Already carries the right status/detail; do not wrap it again.
            raise
        except Exception as e:
            logger.error(f"Failed to fetch Icelandic leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e)) from e

    def _generate_mock_data(self) -> List[Dict[str, Any]]:
        """Return two hard-coded rows used when the original modules are missing."""
        return [
            {
                "Model": "test-model/icelandic-gpt-7b",
                "Average ⬆️": 85.5,
                "Type": "fine-tuned",
                "T": "🔶",
                "Precision": "bfloat16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "apache-2.0",
                "Hub ❤️": 42,
                "#Params (B)": 7.0,
                "Available on the hub": True,
                "Model sha": "abc123def456",
                "WinoGrande-IS (3-shot)": 78.5,
                "GED": 92.3,
                "Inflection (1-shot)": 85.1,
                "Belebele (IS)": 80.7,
                "ARC-Challenge-IS": 76.2,
                "WikiQA-IS": 89.4,
                "Reasoning": False,
                "Note": "",
            },
            {
                "Model": "test-model/icelandic-llama-13b",
                "Average ⬆️": 88.2,
                "Type": "instruction-tuned",
                "T": "⭕",
                "Precision": "float16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "mit",
                "Hub ❤️": 156,
                "#Params (B)": 13.0,
                "Available on the hub": True,
                "Model sha": "def456abc789",
                "WinoGrande-IS (3-shot)": 82.1,
                "GED": 94.8,
                "Inflection (1-shot)": 87.9,
                "Belebele (IS)": 85.3,
                "ARC-Challenge-IS": 79.8,
                "WikiQA-IS": 91.2,
                "Reasoning": True,
                "Note": "reasoning model with 32k thinking budget",
            },
        ]

    async def get_formatted_data(self) -> List[Dict[str, Any]]:
        """Get formatted leaderboard data compatible with the React frontend.

        Rows that fail to transform are logged and skipped so one bad row
        does not fail the whole response.

        Raises:
            HTTPException: status 500 on failure. An HTTPException raised
                while fetching the raw rows is propagated unchanged.
        """
        try:
            raw_data = await self.fetch_raw_data()
            formatted_data = []

            for item in raw_data:
                try:
                    formatted_data.append(await self.transform_data(item))
                except Exception as e:
                    # Best effort: drop the offending row, keep the rest.
                    logger.error(f"Failed to format entry: {e}")

            logger.info(f"Formatted {len(formatted_data)} entries for frontend")
            return formatted_data

        except HTTPException:
            # Preserve the original status/detail instead of nesting it.
            raise
        except Exception as e:
            logger.error(f"Failed to format leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e)) from e

    async def transform_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform one leaderboard row into the format used by the frontend.

        Args:
            data: A leaderboard row keyed by display column name.

        Returns:
            A dict with the keys ``id``, ``model``, ``evaluations``,
            ``features`` and ``metadata``. ``evaluations`` only contains the
            benchmark columns that are present in *data*.
        """
        import re  # local import, as in the original implementation

        # A missing/None model becomes "Unknown"; any other non-string value
        # is stringified so the HTML check below cannot raise TypeError.
        raw_model_name = data.get("Model", "Unknown")
        if raw_model_name is None:
            raw_model_name = "Unknown"
        elif not isinstance(raw_model_name, str):
            raw_model_name = str(raw_model_name)

        # The model cell may be an HTML link; keep only the anchor text.
        model_name = raw_model_name
        if '<a target="_blank" href=' in raw_model_name:
            match = re.search(r'>([^<]+)</a>', raw_model_name)
            if match:
                model_name = match.group(1)

        precision = data.get("Precision", "Unknown")
        revision = data.get("Model sha", "Unknown")
        unique_id = f"{model_name}_{precision}_{revision}"

        # Only benchmark columns present in the row are reported; the raw
        # score is used for both "value" and "normalized_score".
        evaluations = {
            task_key: {
                "name": column,
                "value": data[column],
                "normalized_score": data[column],
            }
            for column, task_key in self._TASK_KEYS.items()
            if column in data
        }

        # The type name is passed through unchanged (the previous lookup table
        # mapped every known type to itself).
        clean_model_type = data.get("Type", "Unknown")

        features = {
            "is_not_available_on_hub": not data.get("Available on the hub", True),
            "is_merged": False,  # Not tracked in Icelandic leaderboard
            "is_moe": False,     # Not tracked in Icelandic leaderboard
            "is_flagged": False, # Not tracked in Icelandic leaderboard
            "is_official_provider": False,  # Not tracked in Icelandic leaderboard
        }

        metadata = {
            "upload_date": None,  # Not available in Icelandic data
            "submission_date": None,  # Not available in Icelandic data
            "generation": None,  # Not available in Icelandic data
            "base_model": None,  # Not available in Icelandic data
            "hub_license": data.get("Hub License", ""),
            "hub_hearts": data.get("Hub ❤️", 0),
            "params_billions": data.get("#Params (B)", 0),
            "co2_cost": 0,  # Not tracked in Icelandic leaderboard
        }

        return {
            "id": unique_id,
            "model": {
                "name": model_name,
                "sha": revision,
                "precision": precision,
                "type": clean_model_type,
                "weight_type": None,  # Not available in Icelandic data
                "architecture": data.get("Architecture", "Unknown"),
                "average_score": data.get("Average ⬆️", 0),
                "has_chat_template": False,  # Not tracked in Icelandic leaderboard
                "reasoning": data.get("Reasoning", False),  # Reasoning enabled flag
                "note": data.get("Note", ""),  # Extra model information
            },
            "evaluations": evaluations,
            "features": features,
            "metadata": metadata,
        }