File size: 10,910 Bytes
1d31670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4577f4
 
 
1d31670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcaca18
 
 
1d31670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcaca18
 
 
1d31670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcaca18
 
 
1d31670
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import os
import json
import logging
from typing import List, Dict, Any
from pathlib import Path
from huggingface_hub import snapshot_download
from fastapi import HTTPException

from app.config import (
    QUEUE_REPO,
    RESULTS_REPO,
    EVAL_REQUESTS_PATH,
    EVAL_RESULTS_PATH,
    HF_TOKEN
)
from app.core.cache import cache_config

logger = logging.getLogger(__name__)

try:
    from app.leaderboard.read_evals import get_raw_eval_results
    from app.populate import get_leaderboard_df
    from app.display.utils import COLS, BENCHMARK_COLS, Tasks
except ImportError as e:
    # Fallback for development without mounted volume: provide minimal
    # stand-ins so the module still imports and can serve mock data.
    logger.warning(f"Could not import original modules: {e}")

    # Column headers expected by the processing/display code.
    COLS = ["Model", "Average ⬆️", "Type", "Precision", "Architecture", "Hub License", "Hub ❤️", "#Params (B)", "Available on the hub", "Model sha"]
    BENCHMARK_COLS = ["WinoGrande-IS (3-shot)", "GED", "Inflection (1-shot)", "Belebele (IS)", "ARC-Challenge-IS", "WikiQA-IS"]

    class MockTask:
        """Stand-in for the real task record: an internal task name plus
        its leaderboard column header."""

        def __init__(self, name, col_name):
            self.name, self.col_name = name, col_name

    class Tasks:
        """Mirror of app.display.utils.Tasks for the Icelandic task set.

        Attribute names (including the task3/task4 gap) match the original
        module so downstream lookups behave identically.
        """

        task0 = MockTask("winogrande_is", "WinoGrande-IS (3-shot)")
        task1 = MockTask("ged", "GED")
        task2 = MockTask("inflection", "Inflection (1-shot)")
        task5 = MockTask("belebele_is", "Belebele (IS)")
        task6 = MockTask("arc_challenge_is", "ARC-Challenge-IS")
        task7 = MockTask("wiki_qa_is", "WikiQA-IS")

class IcelandicLeaderboardService:
    """Serve Icelandic LLM leaderboard data to the React frontend.

    Downloads evaluation results/requests snapshots from the Hugging Face
    Hub on demand, processes them with the original leaderboard code when
    it is importable, and reshapes each row into the nested structure the
    frontend consumes. Falls back to mock rows in development environments
    where the original processing modules are unavailable.
    """

    def __init__(self):
        # Local mirror directories for the HF dataset snapshots.
        self.results_path = EVAL_RESULTS_PATH
        self.requests_path = EVAL_REQUESTS_PATH

    async def _ensure_data_available(self):
        """Ensure evaluation data is available locally.

        Downloads the results and requests dataset snapshots when the
        corresponding local directory is missing or empty.

        Raises:
            HTTPException: 500 when a snapshot download fails.
        """
        try:
            for repo_id, local_dir, label in (
                (RESULTS_REPO, self.results_path, "results"),
                (QUEUE_REPO, self.requests_path, "requests"),
            ):
                # Skip the download when data is already mirrored locally.
                if os.path.exists(local_dir) and os.listdir(local_dir):
                    continue
                logger.info(f"Downloading {label} to {local_dir}")
                snapshot_download(
                    repo_id=repo_id,
                    local_dir=local_dir,
                    repo_type="dataset",
                    token=HF_TOKEN,
                    tqdm_class=None,
                    etag_timeout=30
                )

        except Exception as e:
            logger.error(f"Failed to download data: {e}")
            raise HTTPException(status_code=500, detail=f"Failed to download data: {str(e)}")

    async def fetch_raw_data(self) -> List[Dict[str, Any]]:
        """Fetch raw leaderboard data using original Icelandic processing logic.

        Returns:
            One dict per leaderboard row (DataFrame records), or mock rows
            when the original processing modules are not importable.

        Raises:
            HTTPException: 500 on download or processing failure.
        """
        try:
            await self._ensure_data_available()

            logger.info("Processing Icelandic leaderboard data")

            # Try to use original processing logic if available.
            try:
                # get_leaderboard_df is only bound when the original modules
                # imported successfully; otherwise this raises NameError and
                # we fall back to mock data below.
                raw_data, df = get_leaderboard_df(
                    self.results_path,
                    self.requests_path,
                    COLS,
                    BENCHMARK_COLS
                )

                # Convert DataFrame to list of dictionaries.
                data = df.to_dict('records')

                logger.info(f"Processed {len(data)} Icelandic leaderboard entries")
                return data

            except NameError:
                # Fallback: return mock data for testing.
                logger.warning("Using mock data - original processing modules not available")
                return self._generate_mock_data()

        except HTTPException:
            # Fix: propagate HTTPExceptions (e.g. the download error from
            # _ensure_data_available) unchanged instead of re-wrapping them,
            # which previously mangled the original detail message.
            raise
        except Exception as e:
            logger.error(f"Failed to fetch Icelandic leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    def _generate_mock_data(self) -> List[Dict[str, Any]]:
        """Generate mock data for testing when original modules aren't available."""
        return [
            {
                "Model": "test-model/icelandic-gpt-7b",
                "Average ⬆️": 85.5,
                "Type": "fine-tuned",
                "T": "🔶",
                "Precision": "bfloat16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "apache-2.0",
                "Hub ❤️": 42,
                "#Params (B)": 7.0,
                "Available on the hub": True,
                "Model sha": "abc123def456",
                "WinoGrande-IS (3-shot)": 78.5,
                "GED": 92.3,
                "Inflection (1-shot)": 85.1,
                "Belebele (IS)": 80.7,
                "ARC-Challenge-IS": 76.2,
                "WikiQA-IS": 89.4,
                "Reasoning": False,
                "Note": ""
            },
            {
                "Model": "test-model/icelandic-llama-13b",
                "Average ⬆️": 88.2,
                "Type": "instruction-tuned",
                "T": "⭕",
                "Precision": "float16",
                "Architecture": "LlamaForCausalLM",
                "Hub License": "mit",
                "Hub ❤️": 156,
                "#Params (B)": 13.0,
                "Available on the hub": True,
                "Model sha": "def456abc789",
                "WinoGrande-IS (3-shot)": 82.1,
                "GED": 94.8,
                "Inflection (1-shot)": 87.9,
                "Belebele (IS)": 85.3,
                "ARC-Challenge-IS": 79.8,
                "WikiQA-IS": 91.2,
                "Reasoning": True,
                "Note": "reasoning model with 32k thinking budget"
            }
        ]

    async def get_formatted_data(self) -> List[Dict[str, Any]]:
        """Get formatted leaderboard data compatible with React frontend.

        Rows that fail to transform are logged and skipped rather than
        failing the whole response.

        Raises:
            HTTPException: 500 when fetching the raw data fails.
        """
        try:
            raw_data = await self.fetch_raw_data()
            formatted_data = []

            for item in raw_data:
                try:
                    formatted_item = await self.transform_data(item)
                    formatted_data.append(formatted_item)
                except Exception as e:
                    # Best-effort: a single malformed row must not take down
                    # the whole leaderboard response.
                    logger.error(f"Failed to format entry: {e}")
                    continue

            logger.info(f"Formatted {len(formatted_data)} entries for frontend")
            return formatted_data

        except HTTPException:
            # Fix: don't re-wrap HTTPExceptions raised by fetch_raw_data.
            raise
        except Exception as e:
            logger.error(f"Failed to format leaderboard data: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    @staticmethod
    def _extract_model_name(raw_model_name: str) -> str:
        """Extract the plain model name from the HTML anchor markup the
        original leaderboard embeds in the "Model" column; pass plain
        names through unchanged."""
        if '<a target="_blank" href=' in raw_model_name:
            import re
            match = re.search(r'>([^<]+)</a>', raw_model_name)
            return match.group(1) if match else raw_model_name
        return raw_model_name

    async def transform_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform Icelandic leaderboard data into format expected by React frontend.

        Args:
            data: One raw leaderboard row keyed by display column names.

        Returns:
            Nested dict with ``id``, ``model``, ``evaluations``,
            ``features`` and ``metadata`` sections.
        """
        # Create unique ID from name, precision and revision.
        model_name = self._extract_model_name(data.get("Model", "Unknown"))
        precision = data.get("Precision", "Unknown")
        revision = data.get("Model sha", "Unknown")
        unique_id = f"{model_name}_{precision}_{revision}"

        # Map Icelandic benchmark columns to frontend evaluation keys.
        task_mapping = {
            "WinoGrande-IS (3-shot)": "winogrande_is",
            "GED": "ged",
            "Inflection (1-shot)": "inflection",
            "Belebele (IS)": "belebele_is",
            "ARC-Challenge-IS": "arc_challenge_is",
            "WikiQA-IS": "wiki_qa_is"
        }

        evaluations = {}
        for task_display_name, task_key in task_mapping.items():
            if task_display_name in data:
                evaluations[task_key] = {
                    "name": task_display_name,
                    "value": data.get(task_display_name, 0),
                    "normalized_score": data.get(task_display_name, 0)
                }

        # Model type: symbol is currently unused downstream; the textual
        # type is passed through the identity mapping (kept for the day the
        # frontend names diverge from the leaderboard names).
        model_type_symbol = data.get("T", "")
        model_type_name = data.get("Type", "Unknown")
        type_mapping = {
            "pretrained": "pretrained",
            "fine-tuned": "fine-tuned",
            "instruction-tuned": "instruction-tuned",
            "RL-tuned": "RL-tuned"
        }
        clean_model_type = type_mapping.get(model_type_name, model_type_name)

        features = {
            "is_not_available_on_hub": not data.get("Available on the hub", True),
            "is_merged": False,  # Not tracked in Icelandic leaderboard
            "is_moe": False,     # Not tracked in Icelandic leaderboard
            "is_flagged": False, # Not tracked in Icelandic leaderboard
            "is_official_provider": False  # Not tracked in Icelandic leaderboard
        }

        metadata = {
            "upload_date": None,  # Not available in Icelandic data
            "submission_date": None,  # Not available in Icelandic data
            "generation": None,  # Not available in Icelandic data
            "base_model": None,  # Not available in Icelandic data
            "hub_license": data.get("Hub License", ""),
            "hub_hearts": data.get("Hub ❤️", 0),
            "params_billions": data.get("#Params (B)", 0),
            "co2_cost": 0  # Not tracked in Icelandic leaderboard
        }

        return {
            "id": unique_id,
            "model": {
                "name": model_name,
                "sha": revision,
                "precision": precision,
                "type": clean_model_type,
                "weight_type": None,  # Not available in Icelandic data
                "architecture": data.get("Architecture", "Unknown"),
                "average_score": data.get("Average ⬆️", 0),
                "has_chat_template": False,  # Not tracked in Icelandic leaderboard
                "reasoning": data.get("Reasoning", False),  # Reasoning enabled flag
                "note": data.get("Note", "")  # Extra model information
            },
            "evaluations": evaluations,
            "features": features,
            "metadata": metadata
        }