Spaces:

Enderchef
/

AI-Leaderboard

Sleeping

App Files Files Community

Quazim0t0 committed on Mar 21

Commit

6b3e7b5

verified ·

1 Parent(s): 5149f76

Upload 12 files

Browse files

Files changed (11) hide show

app.py +115 -0
auth.py +309 -0
benchmark_selection.py +511 -0
database_schema.py +393 -0
evaluation_queue.py +947 -0
leaderboard.py +381 -0
requirements.txt +23 -0
sample_benchmarks.py +66 -0
space.yaml +23 -0
test_app.py +237 -0
todo.md +48 -0

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""
+Main application for Dynamic Highscores system.
+This file integrates all components into a unified application.
+"""
+import os
+import gradio as gr
+import threading
+import queue
+from database_schema import init_db
+from auth import HuggingFaceAuth, create_login_ui, setup_auth_handlers
+from benchmark_selection import BenchmarkSelector, create_benchmark_selection_ui
+from evaluation_queue import EvaluationQueue, create_model_submission_ui
+from leaderboard import Leaderboard, create_leaderboard_ui
+from sample_benchmarks import add_sample_benchmarks
+# init_db() returns the database manager that is handed to every component below
+db = init_db()
+# Build the managers; auth comes first because the selector and queue receive it
+auth_manager = HuggingFaceAuth(db)
+benchmark_selector = BenchmarkSelector(db, auth_manager)
+evaluation_queue = EvaluationQueue(db, auth_manager)
+leaderboard = Leaderboard(db)
+# Seed the sample benchmarks when the database reports none
+benchmarks = db.get_benchmarks()
+if not benchmarks:
+    print("No benchmarks found. Adding sample benchmarks...")
+    num_added = add_sample_benchmarks()
+    print(f"Added {num_added} sample benchmarks.")
+# Stylesheet passed to gr.Blocks(css=...). The .info-text, .header and .footer
+# rules match the elem_classes used by the Markdown blocks of this app;
+# .container is defined here but not referenced in this file.
+css = """
+.info-text {
+    background-color: #f0f7ff;
+    padding: 12px;
+    border-radius: 8px;
+    border-left: 4px solid #3498db;
+    margin: 12px 0;
+}
+.container {
+    max-width: 1200px;
+    margin: 0 auto;
+}
+.header {
+    text-align: center;
+    margin-bottom: 20px;
+}
+.footer {
+    text-align: center;
+    margin-top: 40px;
+    padding: 20px;
+    border-top: 1px solid #eee;
+}
+"""
+# Build the Gradio app: header, intro text, login row, three feature tabs, footer
+with gr.Blocks(css=css, title="Dynamic Highscores") as app:
+    gr.Markdown("# 🏆 Dynamic Highscores", elem_classes=["header"])
+    gr.Markdown("""
+    Welcome to Dynamic Highscores - a community benchmark platform for evaluating and comparing language models.
+    - **Add your own benchmarks** from HuggingFace datasets
+    - **Submit your models** for CPU-only evaluation
+    - **Compare performance** across different models and benchmarks
+    - **Filter results** by model type (Merge, Agent, Reasoning, Coding, etc.)
+    """, elem_classes=["info-text"])
+    # Login widgets are created first, then wired to auth_manager
+    login_button, logout_button, token_input, user_info = create_login_ui()
+    setup_auth_handlers(login_button, logout_button, token_input, user_info, auth_manager)
+    # Main tabs: one per feature. The *_ui return values are bound to names
+    # but are not referenced again in this file.
+    with gr.Tabs() as tabs:
+        with gr.TabItem("📊 Leaderboard", id=0):
+            # create_leaderboard_ui receives the database handle as well as the leaderboard
+            leaderboard_ui = create_leaderboard_ui(leaderboard, db)
+        with gr.TabItem("🚀 Submit Model", id=1):
+            submission_ui = create_model_submission_ui(evaluation_queue, auth_manager, db)
+        with gr.TabItem("🔍 Benchmarks", id=2):
+            benchmark_ui = create_benchmark_selection_ui(benchmark_selector, auth_manager)
+    gr.Markdown("""
+    ### About Dynamic Highscores
+    This platform allows users to select benchmarks from HuggingFace datasets and evaluate models against them.
+    Each user can submit one benchmark per day (admin users are exempt from this limit).
+    All evaluations run on CPU only to ensure fair comparisons.
+    Created by Quazim0t0
+    """, elem_classes=["footer"])
+# Start evaluation queue worker after app is defined
+# This prevents the worker from starting before the app is fully initialized
+def start_queue_worker():
+    """Start the evaluation queue worker after a short startup delay.
+    Sleeps for two seconds, then calls ``evaluation_queue.start_worker()``.
+    """
+    from time import sleep
+    startup_delay_seconds = 2
+    # Give the app a moment to finish initializing before the worker starts
+    sleep(startup_delay_seconds)
+    evaluation_queue.start_worker()
+# Launch the app
+if __name__ == "__main__":
+    # Run the queue worker on its own daemon thread (the original author did
+    # this to avoid SQLite thread issues); being a daemon, it never blocks exit.
+    threading.Thread(target=start_queue_worker, daemon=True).start()
+    app.launch()

auth.py ADDED Viewed

	@@ -0,0 +1,309 @@

+"""
+Authentication module for Dynamic Highscores system.
+This module handles user authentication with HuggingFace,
+user session management, and access control.
+"""
+import os
+import json
+import time
+import requests
+import gradio as gr
+from huggingface_hub import HfApi, login
+from functools import wraps
+class HuggingFaceAuth:
+    """Authentication manager for HuggingFace integration.
+    Validates HuggingFace tokens, maps HuggingFace accounts onto the users
+    stored through ``db_manager``, and offers decorators that gate a handler
+    on login or admin status.
+    """
+    def __init__(self, db_manager):
+        """Initialize the authentication manager.
+        Args:
+            db_manager: Database manager instance for user storage
+        """
+        self.db_manager = db_manager
+        self.hf_api = HfApi()
+        # Username treated as admin; can be overridden with the ADMIN_USERNAME env var
+        self.admin_username = os.environ.get("ADMIN_USERNAME", "Quazim0t0")
+    def login_user(self, token):
+        """Log in a user with their HuggingFace token.
+        The token is validated with a per-call ``whoami`` request only. It is
+        deliberately NOT passed to ``huggingface_hub.login``: that call stores
+        the token as the default credential of the server process, so one
+        visitor's token would be reused by later Hub requests made on behalf
+        of other visitors.
+        Args:
+            token: HuggingFace API token
+        Returns:
+            dict: User information if login successful, None otherwise
+        """
+        try:
+            # whoami raises for an invalid token; the except below reports it
+            user_info = self.hf_api.whoami(token=token)
+            if not user_info:
+                return None
+            # Prefer the account name, fall back to the full name
+            username = user_info.get("name", user_info.get("fullname", ""))
+            hf_user_id = user_info.get("id", "")
+            if not hf_user_id:
+                return None
+            # Check if this is the admin account
+            is_admin = (username == self.admin_username)
+            # Add or get user from database (the returned id is not needed here)
+            self.db_manager.add_user(username, hf_user_id, is_admin)
+            return self._load_session_user(hf_user_id, token)
+        except Exception as e:
+            print(f"Login error: {e}")
+            return None
+    def check_login(self, request: gr.Request):
+        """Check if a user is logged in from a Gradio request.
+        Args:
+            request: Gradio request object
+        Returns:
+            dict: User information if logged in, None otherwise
+        """
+        if not request:
+            return None
+        # The session token is read from the "hf_token" cookie
+        token = request.cookies.get("hf_token")
+        if not token:
+            return None
+        try:
+            # Validate token with HuggingFace
+            user_info = self.hf_api.whoami(token=token)
+            if not user_info:
+                return None
+            return self._load_session_user(user_info.get("id", ""), token)
+        except Exception as e:
+            print(f"Check login error: {e}")
+            return None
+    def _load_session_user(self, hf_user_id, token):
+        """Fetch the stored user for ``hf_user_id`` and attach the session token.
+        Returns:
+            dict: The user with a 'token' entry added, or None if no user is found
+        """
+        user = self.db_manager.get_user(hf_user_id)
+        if not user:
+            return None
+        # The token is added to the returned dict only; nothing is written back here
+        user['token'] = token
+        return user
+    @staticmethod
+    def _find_request(args, kwargs):
+        """Return the first gr.Request in ``args``, else ``kwargs['request']``, else None."""
+        for arg in args:
+            if isinstance(arg, gr.Request):
+                return arg
+        return kwargs.get('request')
+    def _guarded(self, func, denial_message, admin_only):
+        """Wrap ``func`` so it only runs for a logged-in (optionally admin) user.
+        The wrapper returns ``denial_message`` instead of calling ``func`` when
+        no request is found, the user is not logged in, or ``admin_only`` is
+        set and the user lacks ``is_admin``. Otherwise the user dict is passed
+        to ``func`` as the ``user`` keyword argument.
+        """
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            # check_login itself returns None for a missing request
+            user = self.check_login(self._find_request(args, kwargs))
+            if not user:
+                return denial_message
+            if admin_only and not user.get('is_admin', False):
+                return denial_message
+            kwargs['user'] = user
+            return func(*args, **kwargs)
+        return wrapper
+    def require_login(self, func):
+        """Decorator to require login for a function.
+        Args:
+            func: Function to decorate
+        Returns:
+            Function: Decorated function that requires login
+        """
+        return self._guarded(func, "Please log in to access this feature.", admin_only=False)
+    def require_admin(self, func):
+        """Decorator to require admin privileges for a function.
+        Args:
+            func: Function to decorate
+        Returns:
+            Function: Decorated function that requires admin privileges
+        """
+        return self._guarded(func, "Admin access required.", admin_only=True)
+    def can_submit_benchmark(self, user_id):
+        """Check if a user can submit a benchmark today.
+        Args:
+            user_id: User ID to check
+        Returns:
+            bool: True if user can submit, False otherwise
+        """
+        return self.db_manager.can_submit_today(user_id)
+    def update_submission_date(self, user_id):
+        """Update the last submission date for a user.
+        Args:
+            user_id: User ID to update
+        """
+        self.db_manager.update_submission_date(user_id)
+# Authentication UI components
+def create_login_ui():
+    """Create the login UI components.
+    Builds one row with two columns: the token box plus the login and logout
+    buttons on the left, and a status text on the right. The logout button
+    starts hidden.
+    Returns:
+        tuple: (login_button, logout_button, token_input, user_info)
+    """
+    with gr.Row():
+        # Left column: token entry and the two buttons
+        with gr.Column(scale=3):
+            token_input = gr.Textbox(
+                placeholder="Enter your HuggingFace token",
+                label="HuggingFace Token",
+                type="password",
+                visible=True,
+                info="Your token is only stored temporarily in browser session cookies and is never saved permanently"
+            )
+            login_button = gr.Button("Login")
+            # Hidden until a login succeeds
+            logout_button = gr.Button("Logout", visible=False)
+        # Right column: login status text
+        with gr.Column(scale=2):
+            user_info = gr.Markdown("Not logged in")
+    return login_button, logout_button, token_input, user_info
+def login_handler(token, auth_manager):
+    """Handle login button click.
+    Args:
+        token: HuggingFace token
+        auth_manager: Authentication manager instance
+    Returns:
+        tuple: (token input update, logout button update, user info text)
+    """
+    if not token:
+        return gr.update(visible=True), gr.update(visible=False), "Please enter your HuggingFace token"
+    user = auth_manager.login_user(token)
+    if not user:
+        return (
+            gr.update(visible=True),   # Keep token input visible
+            gr.update(visible=False),  # Hide logout button
+            "Login failed. Please check your token and try again."  # Update user info
+        )
+    # NOTE(review): this handler does not set the "hf_token" cookie. An earlier
+    # version built a JavaScript snippet for that but never sent it to the
+    # browser, and it interpolated the raw token into script text. The dead
+    # snippet is removed; setting the cookie has to be wired on the click event.
+    return (
+        gr.update(visible=False),  # Hide token input
+        gr.update(visible=True),   # Show logout button
+        f"Logged in as {user['username']}"  # Update user info
+    )
+def logout_handler():
+    """Handle logout button click.
+    Only resets the visible UI state.
+    Returns:
+        tuple: (token input update, logout button update, user info text)
+    """
+    # NOTE(review): this handler does not clear the "hf_token" cookie. An earlier
+    # version built a JavaScript snippet for that but never sent it to the
+    # browser; the dead snippet is removed.
+    return (
+        gr.update(visible=True),   # Show token input
+        gr.update(visible=False),  # Hide logout button
+        "Logged out"               # Update user info
+    )
+def setup_auth_handlers(login_button, logout_button, token_input, user_info, auth_manager):
+    """Wire the login and logout buttons to their handlers.
+    Both events update the same three components: the token input, the
+    logout button and the user info text.
+    Args:
+        login_button: Login button component
+        logout_button: Logout button component
+        token_input: Token input component
+        user_info: User info component
+        auth_manager: Authentication manager instance
+    """
+    def on_login(token):
+        # Bind auth_manager so the event only has to supply the token
+        return login_handler(token, auth_manager)
+    login_button.click(
+        fn=on_login,
+        inputs=[token_input],
+        outputs=[token_input, logout_button, user_info]
+    )
+    logout_button.click(
+        fn=logout_handler,
+        inputs=[],
+        outputs=[token_input, logout_button, user_info]
+    )

benchmark_selection.py ADDED Viewed

	@@ -0,0 +1,511 @@

+"""
+Benchmark selection module for Dynamic Highscores system.
+This module handles browsing, selection, and loading of HuggingFace datasets
+to be used as benchmarks for model evaluation.
+"""
+import os
+import json
+import gradio as gr
+from huggingface_hub import HfApi, list_datasets
+from datasets import load_dataset, get_dataset_config_names
+from functools import partial
+class BenchmarkSelector:
+    """Benchmark selection manager for HuggingFace datasets.
+    Searches the Hub for datasets, previews them, and registers a chosen
+    dataset as a benchmark through ``db_manager``.
+    """
+    # Hub ``task_categories`` tag slugs for the UI categories that have a
+    # direct counterpart. Categories not listed here fall back to a
+    # lower-cased, hyphenated form of the label.
+    TASK_CATEGORY_SLUGS = {
+        "Text Generation": "text-generation",
+        "Question Answering": "question-answering",
+        "Summarization": "summarization",
+        "Translation": "translation",
+        "Classification": "text-classification",
+    }
+    # Longest description shown in search results before it is truncated
+    MAX_DESCRIPTION_LENGTH = 200
+    def __init__(self, db_manager, auth_manager):
+        """Initialize the benchmark selector.
+        Args:
+            db_manager: Database manager instance for benchmark storage
+            auth_manager: Authentication manager instance for access control
+        """
+        self.db_manager = db_manager
+        self.auth_manager = auth_manager
+        self.hf_api = HfApi()
+        # Common benchmark categories for filtering
+        self.categories = [
+            "All",
+            "Text Generation",
+            "Question Answering",
+            "Summarization",
+            "Translation",
+            "Classification",
+            "Code Generation",
+            "Reasoning",
+            "Math"
+        ]
+        # Common metrics for different benchmark types
+        self.metric_templates = {
+            "Text Generation": ["bleu", "rouge", "meteor"],
+            "Question Answering": ["exact_match", "f1"],
+            "Summarization": ["rouge1", "rouge2", "rougeL"],
+            "Translation": ["bleu", "ter"],
+            "Classification": ["accuracy", "f1", "precision", "recall"],
+            "Code Generation": ["exact_match", "pass@k", "functional_correctness"],
+            "Reasoning": ["accuracy", "consistency"],
+            "Math": ["accuracy", "correct_steps"]
+        }
+    def search_datasets(self, query, category="All", limit=50):
+        """Search for datasets on HuggingFace.
+        Args:
+            query: Search query string
+            category: Dataset category to filter by ("All" disables the filter)
+            limit: Maximum number of results to return
+        Returns:
+            list: List of dataset information dictionaries; empty on any error
+        """
+        try:
+            # Hub tags use slugs such as "text-generation", not the display label
+            filter_str = None
+            if category != "All":
+                slug = self.TASK_CATEGORY_SLUGS.get(
+                    category, category.lower().replace(" ", "-")
+                )
+                filter_str = f"task_categories:{slug}"
+            # Search for datasets
+            datasets = list_datasets(
+                search=query,
+                filter=filter_str,
+                limit=limit
+            )
+            # Format results
+            results = []
+            for dataset in datasets:
+                # ``description`` may be absent depending on the installed
+                # huggingface_hub version; a missing attribute must not abort
+                # the whole search.
+                description = getattr(dataset, "description", None)
+                if description and len(description) > self.MAX_DESCRIPTION_LENGTH:
+                    description = description[:self.MAX_DESCRIPTION_LENGTH] + "..."
+                results.append({
+                    "id": dataset.id,
+                    "name": dataset.id.split("/")[-1],
+                    "author": dataset.author,
+                    "description": description,
+                    "tags": dataset.tags,
+                    "downloads": dataset.downloads
+                })
+            return results
+        except Exception as e:
+            print(f"Dataset search error: {e}")
+            return []
+    def get_dataset_info(self, dataset_id):
+        """Get detailed information about a dataset.
+        Args:
+            dataset_id: HuggingFace dataset ID
+        Returns:
+            dict: Dataset information, or None if it could not be fetched
+        """
+        try:
+            # Get dataset info from HuggingFace
+            dataset_info = self.hf_api.dataset_info(dataset_id)
+            # Get available configurations
+            configs = get_dataset_config_names(dataset_id)
+            # ``description`` / ``citation`` may be absent depending on the
+            # installed huggingface_hub version, so read them defensively.
+            return {
+                "id": dataset_info.id,
+                "name": dataset_info.id.split("/")[-1],
+                "author": dataset_info.author,
+                "description": getattr(dataset_info, "description", None),
+                "citation": getattr(dataset_info, "citation", None),
+                "configs": configs,
+                "tags": dataset_info.tags,
+                "downloads": dataset_info.downloads
+            }
+        except Exception as e:
+            print(f"Dataset info error: {e}")
+            return None
+    def load_dataset_sample(self, dataset_id, config=None, split="train", sample_size=5):
+        """Load a sample from a dataset.
+        Args:
+            dataset_id: HuggingFace dataset ID
+            config: Dataset configuration name
+            split: Dataset split to sample from
+            sample_size: Number of samples to load
+        Returns:
+            dict: Dataset sample information, or None if loading failed
+        """
+        try:
+            # Load dataset
+            if config:
+                dataset = load_dataset(dataset_id, config, split=split)
+            else:
+                dataset = load_dataset(dataset_id, split=split)
+            total_size = len(dataset)
+            # Take the first ``sample_size`` rows, or everything if the split is smaller
+            sample = dataset.select(range(sample_size)) if total_size > sample_size else dataset
+            features = list(sample.features.keys())
+            # Lists and dicts are stringified so every cell is a plain value
+            sample_data = [
+                {
+                    key: str(item[key]) if isinstance(item[key], (list, dict)) else item[key]
+                    for key in features
+                }
+                for item in sample
+            ]
+            return {
+                "id": dataset_id,
+                "config": config,
+                "split": split,
+                "features": features,
+                "sample": sample_data,
+                "total_size": total_size
+            }
+        except Exception as e:
+            print(f"Dataset sample error: {e}")
+            return None
+    def add_benchmark(self, dataset_id, name=None, description=None, metrics=None, config=None):
+        """Add a dataset as a benchmark.
+        Args:
+            dataset_id: HuggingFace dataset ID
+            name: Benchmark name (defaults to dataset name)
+            description: Benchmark description (defaults to dataset description)
+            metrics: Metrics to use for evaluation
+            config: Dataset configuration to use
+        Returns:
+            int: Benchmark ID if successful, None otherwise
+        """
+        try:
+            # Get dataset info if name or description not provided
+            if not name or not description:
+                dataset_info = self.get_dataset_info(dataset_id)
+                if not dataset_info:
+                    return None
+                if not name:
+                    name = dataset_info["name"]
+                if not description:
+                    description = dataset_info["description"]
+            # The configuration, when given, is stored as "<dataset_id>:<config>"
+            full_dataset_id = f"{dataset_id}:{config}" if config else dataset_id
+            # Add benchmark to database
+            return self.db_manager.add_benchmark(
+                name=name,
+                dataset_id=full_dataset_id,
+                description=description,
+                metrics=metrics
+            )
+        except Exception as e:
+            print(f"Add benchmark error: {e}")
+            return None
+    def get_benchmarks(self):
+        """Get all available benchmarks.
+        Returns:
+            list: List of benchmark information dictionaries
+        """
+        return self.db_manager.get_benchmarks()
+# Benchmark selection UI components
+def create_benchmark_selection_ui(benchmark_selector, auth_manager):
+    """Create the benchmark selection UI components.
+    Builds two tabs: one to search for, inspect and add a HuggingFace dataset
+    as a benchmark, and one listing the benchmarks already registered.
+    Args:
+        benchmark_selector: Benchmark selector instance
+        auth_manager: Authentication manager instance
+    Returns:
+        gr.Blocks: Gradio Blocks component with benchmark selection UI
+    """
+    with gr.Blocks() as benchmark_ui:
+        gr.Markdown("## 📊 Dynamic Highscores Benchmark Selection")
+        gr.Markdown("""
+        ### Add your own datasets from HuggingFace as benchmarks!
+        You can add any dataset from HuggingFace to use as a benchmark for evaluating models.
+        Simply enter the dataset ID (e.g., 'squad', 'glue', 'hellaswag') and add it as a benchmark.
+        Other users will be able to select your added benchmarks for their model evaluations.
+        """, elem_classes=["info-text"])
+        with gr.Tabs() as tabs:
+            with gr.TabItem("➕ Add New Benchmark", id=0):
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        search_input = gr.Textbox(
+                            placeholder="Search for datasets on HuggingFace...",
+                            label="Search",
+                            show_label=False
+                        )
+                    with gr.Column(scale=1):
+                        category_dropdown = gr.Dropdown(
+                            choices=benchmark_selector.categories,
+                            value="All",
+                            label="Category"
+                        )
+                    with gr.Column(scale=1):
+                        search_button = gr.Button("Search")
+                dataset_results = gr.Dataframe(
+                    headers=["Name", "Author", "Description", "Downloads"],
+                    datatype=["str", "str", "str", "number"],
+                    label="Search Results",
+                    interactive=True
+                )
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        dataset_id_input = gr.Textbox(
+                            placeholder="Enter HuggingFace dataset ID (e.g., 'squad', 'glue', 'hellaswag')",
+                            label="Dataset ID",
+                            info="You can enter any dataset ID from HuggingFace"
+                        )
+                    with gr.Column(scale=1):
+                        view_button = gr.Button("View Dataset Details")
+                with gr.Accordion("Dataset Details", open=False):
+                    dataset_info = gr.JSON(label="Dataset Information")
+                    with gr.Row():
+                        config_dropdown = gr.Dropdown(
+                            label="Configuration",
+                            choices=[],
+                            interactive=True
+                        )
+                        split_dropdown = gr.Dropdown(
+                            label="Split",
+                            choices=["train", "validation", "test"],
+                            value="train",
+                            interactive=True
+                        )
+                        sample_button = gr.Button("Load Sample")
+                    sample_data = gr.Dataframe(
+                        label="Sample Data",
+                        interactive=False
+                    )
+                gr.Markdown("### Add this dataset as a benchmark")
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        benchmark_name = gr.Textbox(
+                            placeholder="Enter a name for this benchmark",
+                            label="Benchmark Name",
+                            info="A descriptive name for this benchmark"
+                        )
+                        benchmark_description = gr.Textbox(
+                            placeholder="Enter a description for this benchmark",
+                            label="Description",
+                            info="Explain what this benchmark evaluates",
+                            lines=3
+                        )
+                    with gr.Column(scale=1):
+                        metrics_input = gr.CheckboxGroup(
+                            label="Evaluation Metrics",
+                            choices=[],
+                            interactive=True,
+                            info="Select metrics to use for evaluation"
+                        )
+                with gr.Row():
+                    add_benchmark_button = gr.Button("Add as Benchmark", size="lg", variant="primary")
+                benchmark_status = gr.Markdown("")
+            with gr.TabItem("📋 Available Benchmarks", id=1):
+                gr.Markdown("### Benchmarks available for model evaluation")
+                gr.Markdown("These benchmarks can be selected when submitting models for evaluation.")
+                with gr.Row():
+                    refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
+                benchmarks_container = gr.Column()
+                with benchmarks_container:
+                    no_benchmarks_message = gr.Markdown(
+                        "### No Datasets Added Yet\n\nBe the first to add a benchmark dataset! Go to the 'Add New Benchmark' tab to add a dataset from HuggingFace.",
+                        visible=True
+                    )
+                    my_benchmarks = gr.Dataframe(
+                        headers=["ID", "Name", "Dataset", "Description"],
+                        label="Available Benchmarks",
+                        interactive=True,
+                        visible=False
+                    )
+        # Event handlers
+        def search_datasets_handler(query, category):
+            """Return search results as rows for the results table (None for an empty query)."""
+            if not query:
+                return None
+            results = benchmark_selector.search_datasets(query, category)
+            return [
+                [result["name"], result["author"], result["description"], result["downloads"]]
+                for result in results
+            ]
+        def view_dataset_handler(dataset_id):
+            """Return (details JSON, config dropdown update, metrics checkbox update)."""
+            # Dropdown / CheckboxGroup options must be changed through gr.update(choices=...):
+            # returning a bare list would set the component's value instead.
+            cleared = (
+                None,
+                gr.update(choices=[], value=None),
+                gr.update(choices=[], value=[])
+            )
+            if not dataset_id:
+                return cleared
+            info = benchmark_selector.get_dataset_info(dataset_id)
+            if not info:
+                return cleared
+            # Offer the metrics of every category whose name shares a word with a dataset tag
+            dataset_tags = {tag.lower() for tag in (info["tags"] or [])}
+            metrics = set()
+            for category, category_metrics in benchmark_selector.metric_templates.items():
+                if any(word in dataset_tags for word in category.lower().split()):
+                    metrics.update(category_metrics)
+            return (
+                info,
+                # No configuration is pre-selected: choosing one stays optional
+                gr.update(choices=info["configs"] or [], value=None),
+                # Sorted so the checkbox order is stable between calls
+                gr.update(choices=sorted(metrics), value=[])
+            )
+        def load_sample_handler(dataset_id, config, split):
+            """Return sample rows of the chosen dataset/config/split, or None on failure."""
+            if not dataset_id:
+                return None
+            sample_info = benchmark_selector.load_dataset_sample(
+                dataset_id,
+                config=config if config else None,
+                split=split
+            )
+            if not sample_info:
+                return None
+            return sample_info["sample"]
+        def add_benchmark_handler(dataset_id, config, name, description, metrics, request: gr.Request):
+            """Register the dataset as a benchmark for a logged-in user; return a status message."""
+            if not dataset_id:
+                return "Please enter a dataset ID from HuggingFace."
+            # Check if user is logged in
+            user = auth_manager.check_login(request)
+            if not user:
+                return "Please log in to add benchmarks."
+            # Empty form values are passed on as None so the selector applies its defaults
+            benchmark_id = benchmark_selector.add_benchmark(
+                dataset_id=dataset_id,
+                name=name if name else None,
+                description=description if description else None,
+                metrics=metrics if metrics else None,
+                config=config if config else None
+            )
+            if benchmark_id:
+                return f"✅ Benchmark added successfully with ID: {benchmark_id}\n\nThis dataset is now available for model evaluation. You can view it in the 'Available Benchmarks' tab."
+            else:
+                return "❌ Failed to add benchmark. Please check the dataset ID and try again."
+        def get_benchmarks_handler(request: gr.Request):
+            """Return (empty-state message update, benchmarks table update)."""
+            show_message = (gr.update(visible=True), gr.update(visible=False))
+            # Check if user is logged in
+            user = auth_manager.check_login(request)
+            if not user:
+                return show_message
+            # Get benchmarks
+            benchmarks = benchmark_selector.get_benchmarks()
+            if not benchmarks:
+                return show_message
+            # Format for dataframe
+            formatted_benchmarks = [
+                [
+                    benchmark["id"],
+                    benchmark["name"],
+                    benchmark["dataset_id"],
+                    benchmark["description"]
+                ]
+                for benchmark in benchmarks
+            ]
+            # Visibility and data go out in one update so the table is listed once as an output
+            return gr.update(visible=False), gr.update(visible=True, value=formatted_benchmarks)
+        # Connect event handlers
+        search_button.click(
+            fn=search_datasets_handler,
+            inputs=[search_input, category_dropdown],
+            outputs=[dataset_results]
+        )
+        view_button.click(
+            fn=view_dataset_handler,
+            inputs=[dataset_id_input],
+            outputs=[dataset_info, config_dropdown, metrics_input]
+        )
+        sample_button.click(
+            fn=load_sample_handler,
+            inputs=[dataset_id_input, config_dropdown, split_dropdown],
+            outputs=[sample_data]
+        )
+        add_benchmark_button.click(
+            fn=add_benchmark_handler,
+            inputs=[dataset_id_input, config_dropdown, benchmark_name, benchmark_description, metrics_input],
+            outputs=[benchmark_status]
+        )
+        refresh_benchmarks_button.click(
+            fn=get_benchmarks_handler,
+            inputs=[],
+            outputs=[no_benchmarks_message, my_benchmarks]
+        )
+        # Initialize benchmarks on load
+        benchmark_ui.load(
+            fn=get_benchmarks_handler,
+            inputs=[],
+            outputs=[no_benchmarks_message, my_benchmarks]
+        )
+    return benchmark_ui

database_schema.py ADDED Viewed

	@@ -0,0 +1,393 @@

+"""
+Database schema for Dynamic Highscores system.
+This module defines the SQLite database schema for the Dynamic Highscores system,
+which integrates benchmark selection, model evaluation, and leaderboard functionality.
+"""
+import sqlite3
+import os
+import json
+from datetime import datetime, timedelta
+import pandas as pd
+class DynamicHighscoresDB:
+    """Database manager for the Dynamic Highscores system."""
+    def __init__(self, db_path="dynamic_highscores.db"):
+        """Initialize the database connection and create tables if they don't exist."""
+        self.db_path = db_path
+        self.conn = None
+        self.cursor = None
+        self.connect()
+        self.create_tables()
+    def connect(self):
+        """Connect to the SQLite database."""
+        self.conn = sqlite3.connect(self.db_path)
+        self.conn.row_factory = sqlite3.Row
+        self.cursor = self.conn.cursor()
+    def close(self):
+        """Close the database connection."""
+        if self.conn:
+            self.conn.close()
+    def create_tables(self):
+        """Create all necessary tables if they don't exist."""
+        # Users table - stores user information
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS users (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            username TEXT UNIQUE NOT NULL,
+            hf_user_id TEXT UNIQUE NOT NULL,
+            is_admin BOOLEAN DEFAULT 0,
+            last_submission_date TEXT,
+            created_at TEXT DEFAULT CURRENT_TIMESTAMP
+        )
+        ''')
+        # Benchmarks table - stores information about available benchmarks
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS benchmarks (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL,
+            dataset_id TEXT NOT NULL,
+            description TEXT,
+            metrics TEXT,  -- JSON string of metrics
+            created_at TEXT DEFAULT CURRENT_TIMESTAMP
+        )
+        ''')
+        # Models table - stores information about submitted models
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS models (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL,
+            hf_model_id TEXT NOT NULL,
+            user_id INTEGER NOT NULL,
+            tag TEXT NOT NULL,  -- One of: Merge, Agent, Reasoning, Coding, etc.
+            parameters TEXT,  -- Number of parameters (can be NULL)
+            description TEXT,
+            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+            FOREIGN KEY (user_id) REFERENCES users (id),
+            UNIQUE (hf_model_id, user_id)
+        )
+        ''')
+        # Evaluations table - stores evaluation results
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS evaluations (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            model_id INTEGER NOT NULL,
+            benchmark_id INTEGER NOT NULL,
+            status TEXT NOT NULL,  -- pending, running, completed, failed
+            results TEXT,  -- JSON string of results
+            score REAL,  -- Overall score (can be NULL)
+            submitted_at TEXT DEFAULT CURRENT_TIMESTAMP,
+            completed_at TEXT,
+            FOREIGN KEY (model_id) REFERENCES models (id),
+            FOREIGN KEY (benchmark_id) REFERENCES benchmarks (id)
+        )
+        ''')
+        # Queue table - stores evaluation queue
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS queue (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            evaluation_id INTEGER NOT NULL,
+            priority INTEGER DEFAULT 0,  -- Higher number = higher priority
+            added_at TEXT DEFAULT CURRENT_TIMESTAMP,
+            FOREIGN KEY (evaluation_id) REFERENCES evaluations (id)
+        )
+        ''')
+        self.conn.commit()
+    # User management methods
+    def add_user(self, username, hf_user_id, is_admin=False):
+        """Add a new user to the database."""
+        try:
+            self.cursor.execute(
+                "INSERT INTO users (username, hf_user_id, is_admin) VALUES (?, ?, ?)",
+                (username, hf_user_id, is_admin)
+            )
+            self.conn.commit()
+            return self.cursor.lastrowid
+        except sqlite3.IntegrityError:
+            # User already exists
+            self.cursor.execute(
+                "SELECT id FROM users WHERE hf_user_id = ?",
+                (hf_user_id,)
+            )
+            return self.cursor.fetchone()[0]
+    def get_user(self, hf_user_id):
+        """Get user information by HuggingFace user ID."""
+        self.cursor.execute(
+            "SELECT * FROM users WHERE hf_user_id = ?",
+            (hf_user_id,)
+        )
+        return dict(self.cursor.fetchone()) if self.cursor.fetchone() else None
+    def can_submit_today(self, user_id):
+        """Check if a user can submit a benchmark evaluation today."""
+        self.cursor.execute(
+            "SELECT is_admin, last_submission_date FROM users WHERE id = ?",
+            (user_id,)
+        )
+        result = self.cursor.fetchone()
+        if not result:
+            return False
+        user_data = dict(result)
+        # Admin can always submit
+        if user_data['is_admin']:
+            return True
+        # If no previous submission, user can submit
+        if not user_data['last_submission_date']:
+            return True
+        # Check if last submission was before today
+        last_date = datetime.fromisoformat(user_data['last_submission_date'])
+        today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
+        return last_date < today
+    def update_submission_date(self, user_id):
+        """Update the last submission date for a user."""
+        current_time = datetime.now().isoformat()
+        self.cursor.execute(
+            "UPDATE users SET last_submission_date = ? WHERE id = ?",
+            (current_time, user_id)
+        )
+        self.conn.commit()
+    # Benchmark management methods
+    def add_benchmark(self, name, dataset_id, description="", metrics=None):
+        """Add a new benchmark to the database."""
+        if metrics is None:
+            metrics = {}
+        metrics_json = json.dumps(metrics)
+        try:
+            self.cursor.execute(
+                "INSERT INTO benchmarks (name, dataset_id, description, metrics) VALUES (?, ?, ?, ?)",
+                (name, dataset_id, description, metrics_json)
+            )
+            self.conn.commit()
+            return self.cursor.lastrowid
+        except sqlite3.IntegrityError:
+            # Benchmark already exists with this dataset_id
+            self.cursor.execute(
+                "SELECT id FROM benchmarks WHERE dataset_id = ?",
+                (dataset_id,)
+            )
+            return self.cursor.fetchone()[0]
+    def get_benchmarks(self):
+        """Get all available benchmarks."""
+        self.cursor.execute("SELECT * FROM benchmarks")
+        benchmarks = [dict(row) for row in self.cursor.fetchall()]
+        # Parse metrics JSON
+        for benchmark in benchmarks:
+            benchmark['metrics'] = json.loads(benchmark['metrics'])
+        return benchmarks
+    def get_benchmark(self, benchmark_id):
+        """Get benchmark information by ID."""
+        self.cursor.execute(
+            "SELECT * FROM benchmarks WHERE id = ?",
+            (benchmark_id,)
+        )
+        benchmark = dict(self.cursor.fetchone()) if self.cursor.fetchone() else None
+        if benchmark:
+            benchmark['metrics'] = json.loads(benchmark['metrics'])
+        return benchmark
+    # Model management methods
+    def add_model(self, name, hf_model_id, user_id, tag, parameters=None, description=""):
+        """Add a new model to the database."""
+        try:
+            self.cursor.execute(
+                "INSERT INTO models (name, hf_model_id, user_id, tag, parameters, description) VALUES (?, ?, ?, ?, ?, ?)",
+                (name, hf_model_id, user_id, tag, parameters, description)
+            )
+            self.conn.commit()
+            return self.cursor.lastrowid
+        except sqlite3.IntegrityError:
+            # Model already exists for this user
+            self.cursor.execute(
+                "SELECT id FROM models WHERE hf_model_id = ? AND user_id = ?",
+                (hf_model_id, user_id)
+            )
+            return self.cursor.fetchone()[0]
+    def get_models(self, tag=None):
+        """Get all models, optionally filtered by tag."""
+        if tag:
+            self.cursor.execute(
+                "SELECT * FROM models WHERE tag = ?",
+                (tag,)
+            )
+        else:
+            self.cursor.execute("SELECT * FROM models")
+        return [dict(row) for row in self.cursor.fetchall()]
+    def get_model(self, model_id):
+        """Get model information by ID."""
+        self.cursor.execute(
+            "SELECT * FROM models WHERE id = ?",
+            (model_id,)
+        )
+        return dict(self.cursor.fetchone()) if self.cursor.fetchone() else None
+    # Evaluation management methods
+    def add_evaluation(self, model_id, benchmark_id, priority=0):
+        """Add a new evaluation to the database and queue."""
+        # First, add the evaluation
+        self.cursor.execute(
+            "INSERT INTO evaluations (model_id, benchmark_id, status) VALUES (?, ?, 'pending')",
+            (model_id, benchmark_id)
+        )
+        evaluation_id = self.cursor.lastrowid
+        # Then, add it to the queue
+        self.cursor.execute(
+            "INSERT INTO queue (evaluation_id, priority) VALUES (?, ?)",
+            (evaluation_id, priority)
+        )
+        self.conn.commit()
+        return evaluation_id
+    def update_evaluation_status(self, evaluation_id, status, results=None, score=None):
+        """Update the status of an evaluation."""
+        params = [status, evaluation_id]
+        sql = "UPDATE evaluations SET status = ?"
+        if results is not None:
+            sql += ", results = ?"
+            params.insert(1, json.dumps(results))
+        if score is not None:
+            sql += ", score = ?"
+            params.insert(1 if results is None else 2, score)
+        if status in ['completed', 'failed']:
+            sql += ", completed_at = ?"
+            params.insert(1 if results is None and score is None else (2 if results is None or score is None else 3),
+                         datetime.now().isoformat())
+        sql += " WHERE id = ?"
+        self.cursor.execute(sql, params)
+        self.conn.commit()
+        # If completed or failed, remove from queue
+        if status in ['completed', 'failed']:
+            self.cursor.execute(
+                "DELETE FROM queue WHERE evaluation_id = ?",
+                (evaluation_id,)
+            )
+            self.conn.commit()
+    def get_next_in_queue(self):
+        """Get the next evaluation in the queue."""
+        self.cursor.execute("""
+            SELECT q.id as queue_id, q.evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
+            FROM queue q
+            JOIN evaluations e ON q.evaluation_id = e.id
+            JOIN models m ON e.model_id = m.id
+            JOIN benchmarks b ON e.benchmark_id = b.id
+            WHERE e.status = 'pending'
+            ORDER BY q.priority DESC, q.added_at ASC
+            LIMIT 1
+        """)
+        result = self.cursor.fetchone()
+        return dict(result) if result else None
+    def get_evaluation_results(self, model_id=None, benchmark_id=None, tag=None):
+        """Get evaluation results, optionally filtered by model, benchmark, or tag."""
+        sql = """
+            SELECT e.id, e.model_id, e.benchmark_id, e.status, e.results, e.score,
+                   e.submitted_at, e.completed_at, m.name as model_name, m.tag,
+                   b.name as benchmark_name
+            FROM evaluations e
+            JOIN models m ON e.model_id = m.id
+            JOIN benchmarks b ON e.benchmark_id = b.id
+            WHERE e.status = 'completed'
+        """
+        params = []
+        if model_id:
+            sql += " AND e.model_id = ?"
+            params.append(model_id)
+        if benchmark_id:
+            sql += " AND e.benchmark_id = ?"
+            params.append(benchmark_id)
+        if tag:
+            sql += " AND m.tag = ?"
+            params.append(tag)
+        sql += " ORDER BY e.completed_at DESC"
+        self.cursor.execute(sql, params)
+        results = [dict(row) for row in self.cursor.fetchall()]
+        # Parse results JSON
+        for result in results:
+            if result['results']:
+                result['results'] = json.loads(result['results'])
+        return results
+    def get_leaderboard_df(self, tag=None):
+        """Get a pandas DataFrame of the leaderboard, optionally filtered by tag."""
+        results = self.get_evaluation_results(tag=tag)
+        if not results:
+            return pd.DataFrame()
+        # Create a list of dictionaries for the DataFrame
+        leaderboard_data = []
+        for result in results:
+            entry = {
+                'model_name': result['model_name'],
+                'model_id': result['model_id'],
+                'benchmark_name': result['benchmark_name'],
+                'benchmark_id': result['benchmark_id'],
+                'tag': result['tag'],
+                'score': result['score'],
+                'completed_at': result['completed_at']
+            }
+            # Add individual metrics from results
+            if result['results'] and isinstance(result['results'], dict):
+                for metric, value in result['results'].items():
+                    if isinstance(value, (int, float)):
+                        entry[f'metric_{metric}'] = value
+            leaderboard_data.append(entry)
+        return pd.DataFrame(leaderboard_data)
+# Initialize the database
+def init_db(db_path="dynamic_highscores.db"):
+    """Initialize the database and return the database manager."""
+    db = DynamicHighscoresDB(db_path)
+    return db

evaluation_queue.py ADDED Viewed

	@@ -0,0 +1,947 @@

+"""
+Model evaluation queue system for Dynamic Highscores.
+This module handles the evaluation queue, CPU-only processing,
+and enforces daily submission limits for users.
+"""
+import os
+import json
+import time
+import threading
+import queue
+from datetime import datetime, timedelta
+import gradio as gr
+from huggingface_hub import HfApi, hf_hub_download, snapshot_download
+from datasets import load_dataset
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import sqlite3
+class EvaluationQueue:
+    """Manages the evaluation queue for model benchmarking."""
+    def __init__(self, db_manager, auth_manager):
+        """Initialize the evaluation queue manager.
+        Args:
+            db_manager: Database manager instance
+            auth_manager: Authentication manager instance
+        """
+        self.db_manager = db_manager
+        self.auth_manager = auth_manager
+        self.hf_api = HfApi()
+        self.queue = queue.Queue()
+        self.is_processing = False
+        self.worker_thread = None
+        self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
+        self.current_evaluation = None
+        self.progress = 0
+        self.progress_lock = threading.Lock()
+        self.db_path = db_manager.db_path  # Store the path to create new connections in worker thread
+    def start_worker(self):
+        """Start the worker thread for processing the evaluation queue."""
+        if self.worker_thread is None or not self.worker_thread.is_alive():
+            self.is_processing = True
+            self.worker_thread = threading.Thread(target=self._process_queue)
+            self.worker_thread.daemon = True
+            self.worker_thread.start()
+    def stop_worker(self):
+        """Stop the worker thread."""
+        self.is_processing = False
+        if self.worker_thread and self.worker_thread.is_alive():
+            self.worker_thread.join(timeout=1.0)
+    def _process_queue(self):
+        """Process the evaluation queue in a separate thread."""
+        # Create a new database connection for this thread
+        thread_db = sqlite3.connect(self.db_path)
+        thread_db.row_factory = sqlite3.Row
+        while self.is_processing:
+            try:
+                # Get the next evaluation from the database using thread-local connection
+                cursor = thread_db.cursor()
+                cursor.execute("""
+                    SELECT e.id as evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
+                    FROM queue q
+                    JOIN evaluations e ON q.evaluation_id = e.id
+                    JOIN models m ON e.model_id = m.id
+                    JOIN benchmarks b ON e.benchmark_id = b.id
+                    WHERE e.status = 'pending'
+                    ORDER BY q.priority DESC, q.created_at ASC
+                    LIMIT 1
+                """)
+                row = cursor.fetchone()
+                if row:
+                    next_eval = dict(row)
+                    # Update status to running
+                    cursor.execute("""
+                        UPDATE evaluations
+                        SET status = 'running', started_at = datetime('now')
+                        WHERE id = ?
+                    """, (next_eval['evaluation_id'],))
+                    thread_db.commit()
+                    # Set current evaluation and reset progress
+                    with self.progress_lock:
+                        self.current_evaluation = next_eval
+                        self.progress = 0
+                    try:
+                        # Run the evaluation
+                        results = self._run_evaluation(
+                            next_eval['hf_model_id'],
+                            next_eval['dataset_id']
+                        )
+                        # Calculate overall score
+                        score = self._calculate_overall_score(results)
+                        # Update status to completed with results
+                        cursor.execute("""
+                            UPDATE evaluations
+                            SET status = 'completed',
+                                completed_at = datetime('now'),
+                                results = ?,
+                                score = ?
+                            WHERE id = ?
+                        """, (json.dumps(results), score, next_eval['evaluation_id']))
+                        thread_db.commit()
+                    except Exception as e:
+                        print(f"Evaluation error: {e}")
+                        # Update status to failed
+                        cursor.execute("""
+                            UPDATE evaluations
+                            SET status = 'failed', completed_at = datetime('now')
+                            WHERE id = ?
+                        """, (next_eval['evaluation_id'],))
+                        thread_db.commit()
+                    # Clear current evaluation
+                    with self.progress_lock:
+                        self.current_evaluation = None
+                        self.progress = 0
+                else:
+                    # No evaluations in queue, sleep for a bit
+                    time.sleep(5)
+            except Exception as e:
+                print(f"Queue processing error: {e}")
+                time.sleep(5)
+        # Close the thread-local database connection
+        thread_db.close()
+    def _run_evaluation(self, model_id, dataset_id):
+        """Run an evaluation for a model on a benchmark.
+        Args:
+            model_id: HuggingFace model ID
+            dataset_id: HuggingFace dataset ID (with optional config)
+        Returns:
+            dict: Evaluation results
+        """
+        # Update progress
+        with self.progress_lock:
+            self.progress = 5  # Starting evaluation
+        # Parse dataset ID and config
+        if ":" in dataset_id:
+            dataset_id, config = dataset_id.split(":", 1)
+        else:
+            config = None
+        # Update progress
+        with self.progress_lock:
+            self.progress = 10  # Loading dataset
+        # Load the dataset
+        if config:
+            dataset = load_dataset(dataset_id, config, split="test")
+        else:
+            dataset = load_dataset(dataset_id, split="test")
+        # Update progress
+        with self.progress_lock:
+            self.progress = 20  # Loading model
+        # Load the model (CPU only)
+        device = "cpu"
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map=device,
+            torch_dtype=torch.float32,  # Use float32 for CPU
+            low_cpu_mem_usage=True
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        # Update progress
+        with self.progress_lock:
+            self.progress = 30  # Determining task type
+        # Determine task type based on dataset features
+        task_type = self._determine_task_type(dataset)
+        # Update progress
+        with self.progress_lock:
+            self.progress = 40  # Starting evaluation
+        # Run appropriate evaluation based on task type
+        if task_type == "text-generation":
+            results = self._evaluate_text_generation(model, tokenizer, dataset)
+        elif task_type == "question-answering":
+            results = self._evaluate_question_answering(model, tokenizer, dataset)
+        elif task_type == "classification":
+            results = self._evaluate_classification(model, tokenizer, dataset)
+        elif task_type == "code-generation":
+            results = self._evaluate_code_generation(model, tokenizer, dataset)
+        else:
+            # Default to general evaluation
+            results = self._evaluate_general(model, tokenizer, dataset)
+        # Update progress
+        with self.progress_lock:
+            self.progress = 95  # Cleaning up
+        # Clean up to free memory
+        del model
+        del tokenizer
+        torch.cuda.empty_cache()
+        # Update progress
+        with self.progress_lock:
+            self.progress = 100  # Completed
+        return results
+    def get_current_progress(self):
+        """Get the current evaluation progress.
+        Returns:
+            tuple: (current_evaluation, progress_percentage)
+        """
+        with self.progress_lock:
+            return self.current_evaluation, self.progress
+    def _determine_task_type(self, dataset):
+        """Determine the task type based on dataset features.
+        Args:
+            dataset: HuggingFace dataset
+        Returns:
+            str: Task type
+        """
+        features = dataset.features
+        # Check for common feature patterns
+        if "question" in features and "answer" in features:
+            return "question-answering"
+        elif "code" in features or "solution" in features:
+            return "code-generation"
+        elif "label" in features or "class" in features:
+            return "classification"
+        elif "input" in features and "output" in features:
+            return "text-generation"
+        else:
+            return "general"
+    def _evaluate_text_generation(self, model, tokenizer, dataset):
+        """Evaluate a model on text generation tasks.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up generation pipeline
+        generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation (to keep runtime reasonable)
+        if len(dataset) > 100:
+            dataset = dataset.select(range(100))
+        # Track metrics
+        correct = 0
+        total = 0
+        generated_texts = []
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            input_text = example.get("input", example.get("prompt", ""))
+            expected_output = example.get("output", example.get("target", ""))
+            if not input_text or not expected_output:
+                continue
+            # Generate text
+            generated = generator(
+                input_text,
+                max_length=100,
+                num_return_sequences=1
+            )
+            generated_text = generated[0]["generated_text"]
+            generated_texts.append(generated_text)
+            # Simple exact match check
+            if expected_output.strip() in generated_text:
+                correct += 1
+            total += 1
+        # Calculate metrics
+        accuracy = correct / total if total > 0 else 0
+        return {
+            "accuracy": accuracy,
+            "samples_evaluated": total,
+            "generated_samples": generated_texts[:5]  # Include a few samples
+        }
+    def _evaluate_question_answering(self, model, tokenizer, dataset):
+        """Evaluate a model on question answering tasks.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up QA pipeline
+        qa_pipeline = pipeline(
+            "question-answering",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation
+        if len(dataset) > 100:
+            dataset = dataset.select(range(100))
+        # Track metrics
+        exact_matches = 0
+        f1_scores = []
+        total = 0
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            question = example.get("question", "")
+            context = example.get("context", "")
+            answer = example.get("answer", "")
+            if not question or not answer:
+                continue
+            # Get model prediction
+            if context:
+                result = qa_pipeline(question=question, context=context)
+            else:
+                # If no context provided, use the question as context
+                result = qa_pipeline(question=question, context=question)
+            predicted_answer = result["answer"]
+            # Calculate exact match
+            if predicted_answer.strip() == answer.strip():
+                exact_matches += 1
+            # Calculate F1 score
+            f1 = self._calculate_f1(answer, predicted_answer)
+            f1_scores.append(f1)
+            total += 1
+        # Calculate metrics
+        exact_match_accuracy = exact_matches / total if total > 0 else 0
+        avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
+        return {
+            "exact_match": exact_match_accuracy,
+            "f1": avg_f1,
+            "samples_evaluated": total
+        }
+    def _evaluate_classification(self, model, tokenizer, dataset):
+        """Evaluate a model on classification tasks.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up classification pipeline
+        classifier = pipeline(
+            "text-classification",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation
+        if len(dataset) > 100:
+            dataset = dataset.select(range(100))
+        # Track metrics
+        correct = 0
+        total = 0
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            text = example.get("text", example.get("sentence", ""))
+            label = str(example.get("label", example.get("class", "")))
+            if not text or not label:
+                continue
+            # Get model prediction
+            result = classifier(text)
+            predicted_label = result[0]["label"]
+            # Check if correct
+            if str(predicted_label) == label:
+                correct += 1
+            total += 1
+        # Calculate metrics
+        accuracy = correct / total if total > 0 else 0
+        return {
+            "accuracy": accuracy,
+            "samples_evaluated": total
+        }
+    def _evaluate_code_generation(self, model, tokenizer, dataset):
+        """Evaluate a model on code generation tasks.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up generation pipeline
+        generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation
+        if len(dataset) > 50:  # Smaller sample for code tasks
+            dataset = dataset.select(range(50))
+        # Track metrics
+        exact_matches = 0
+        functional_matches = 0
+        total = 0
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            prompt = example.get("prompt", example.get("input", ""))
+            solution = example.get("solution", example.get("output", ""))
+            if not prompt or not solution:
+                continue
+            # Generate code
+            generated = generator(
+                prompt,
+                max_length=200,
+                num_return_sequences=1
+            )
+            generated_code = generated[0]["generated_text"]
+            # Extract code from generated text (remove prompt)
+            if prompt in generated_code:
+                generated_code = generated_code[len(prompt):].strip()
+            # Check exact match
+            if generated_code.strip() == solution.strip():
+                exact_matches += 1
+                functional_matches += 1
+            else:
+                # We would ideally check functional correctness here
+                # but that requires executing code which is complex and potentially unsafe
+                # For now, we'll use a simple heuristic
+                if len(generated_code) > 0 and any(keyword in generated_code for keyword in ["def ", "function", "return", "class"]):
+                    functional_matches += 0.5  # Partial credit
+            total += 1
+        # Calculate metrics
+        exact_match_rate = exact_matches / total if total > 0 else 0
+        functional_correctness = functional_matches / total if total > 0 else 0
+        return {
+            "exact_match": exact_match_rate,
+            "functional_correctness": functional_correctness,
+            "samples_evaluated": total
+        }
+    def _evaluate_general(self, model, tokenizer, dataset):
+        """General evaluation for any dataset type.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up generation pipeline
+        generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation
+        if len(dataset) > 50:
+            dataset = dataset.select(range(50))
+        # Find input and output fields
+        features = dataset.features
+        input_field = None
+        output_field = None
+        for field in features:
+            if field.lower() in ["input", "prompt", "question", "text"]:
+                input_field = field
+            elif field.lower() in ["output", "target", "answer", "response"]:
+                output_field = field
+        if not input_field:
+            # Just use the first string field as input
+            for field in features:
+                if isinstance(features[field], (str, list)):
+                    input_field = field
+                    break
+        # Track metrics
+        total = 0
+        generated_texts = []
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            if input_field and input_field in example:
+                input_text = str(example[input_field])
+                # Generate text
+                generated = generator(
+                    input_text,
+                    max_length=100,
+                    num_return_sequences=1
+                )
+                generated_text = generated[0]["generated_text"]
+                generated_texts.append({
+                    "input": input_text,
+                    "output": generated_text,
+                    "expected": str(example[output_field]) if output_field and output_field in example else "N/A"
+                })
+                total += 1
+        return {
+            "samples_evaluated": total,
+            "generated_samples": generated_texts[:5]  # Include a few samples
+        }
+    def _calculate_f1(self, answer, prediction):
+        """Calculate F1 score between answer and prediction.
+        Args:
+            answer: Ground truth answer
+            prediction: Model prediction
+        Returns:
+            float: F1 score
+        """
+        # Tokenize
+        answer_tokens = answer.lower().split()
+        prediction_tokens = prediction.lower().split()
+        # Calculate precision and recall
+        common_tokens = set(answer_tokens) & set(prediction_tokens)
+        if not common_tokens:
+            return 0.0
+        precision = len(common_tokens) / len(prediction_tokens)
+        recall = len(common_tokens) / len(answer_tokens)
+        # Calculate F1
+        if precision + recall == 0:
+            return 0.0
+        f1 = 2 * precision * recall / (precision + recall)
+        return f1
+    def _calculate_overall_score(self, results):
+        """Calculate an overall score from evaluation results.
+        Args:
+            results: Evaluation results dictionary
+        Returns:
+            float: Overall score between 0 and 100
+        """
+        score = 0.0
+        # Check for common metrics and weight them
+        if "accuracy" in results:
+            score += results["accuracy"] * 100
+        if "exact_match" in results:
+            score += results["exact_match"] * 100
+        if "f1" in results:
+            score += results["f1"] * 100
+        if "functional_correctness" in results:
+            score += results["functional_correctness"] * 100
+        # If multiple metrics were found, average them
+        num_metrics = sum(1 for metric in ["accuracy", "exact_match", "f1", "functional_correctness"] if metric in results)
+        if num_metrics > 0:
+            score /= num_metrics
+        else:
+            # Default score if no metrics available
+            score = 50.0
+        return score
+    def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
+        """Submit a model for evaluation on a benchmark.
+        Args:
+            model_id: Model ID in the database
+            benchmark_id: Benchmark ID in the database
+            user_id: User ID submitting the evaluation
+            priority: Queue priority (higher = higher priority)
+        Returns:
+            int: Evaluation ID if successful, None otherwise
+        """
+        # Check if user can submit today
+        if not self.auth_manager.can_submit_benchmark(user_id):
+            return None, "Daily submission limit reached. Try again tomorrow."
+        try:
+            # Add evaluation to database and queue
+            evaluation_id = self.db_manager.add_evaluation(
+                model_id=model_id,
+                benchmark_id=benchmark_id,
+                priority=priority
+            )
+            # Update user's last submission date
+            self.auth_manager.update_submission_date(user_id)
+            # Make sure worker is running
+            self.start_worker()
+            return evaluation_id, "Evaluation submitted successfully."
+        except Exception as e:
+            print(f"Submit evaluation error: {e}")
+            return None, f"Failed to submit evaluation: {str(e)}"
+    def get_queue_status(self):
+        """Get the current status of the evaluation queue.
+        Returns:
+            dict: Queue status information
+        """
+        try:
+            # Get evaluations from database
+            pending_evals = self.db_manager.get_evaluation_results(status="pending")
+            running_evals = self.db_manager.get_evaluation_results(status="running")
+            completed_evals = self.db_manager.get_evaluation_results(status="completed")
+            failed_evals = self.db_manager.get_evaluation_results(status="failed")
+            # Get current evaluation progress
+            current_eval, progress = self.get_current_progress()
+            return {
+                "pending": len(pending_evals),
+                "running": len(running_evals),
+                "completed": len(completed_evals),
+                "failed": len(failed_evals),
+                "is_processing": self.is_processing,
+                "current_evaluation": current_eval,
+                "progress": progress
+            }
+        except Exception as e:
+            print(f"Queue status error: {e}")
+            return {
+                "pending": 0,
+                "running": 0,
+                "completed": 0,
+                "failed": 0,
+                "is_processing": self.is_processing,
+                "current_evaluation": None,
+                "progress": 0,
+                "error": str(e)
+            }
+# Model submission UI components
+def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
+    """Create the model submission UI components.
+    Args:
+        evaluation_queue: Evaluation queue instance
+        auth_manager: Authentication manager instance
+        db_manager: Database manager instance
+    Returns:
+        gr.Blocks: Gradio Blocks component with model submission UI
+    """
+    with gr.Blocks() as submission_ui:
+        with gr.Tab("Submit Model"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    model_id_input = gr.Textbox(
+                        placeholder="HuggingFace model ID (e.g., 'gpt2', 'facebook/opt-350m')",
+                        label="Model ID"
+                    )
+                    model_name_input = gr.Textbox(
+                        placeholder="Display name for your model",
+                        label="Model Name"
+                    )
+                    model_description_input = gr.Textbox(
+                        placeholder="Brief description of your model",
+                        label="Description",
+                        lines=3
+                    )
+                    model_parameters_input = gr.Number(
+                        label="Number of Parameters (billions)",
+                        precision=2
+                    )
+                with gr.Column(scale=1):
+                    model_tag_input = gr.Dropdown(
+                        choices=evaluation_queue.model_tags,
+                        label="Model Tag",
+                        info="Select one category that best describes your model"
+                    )
+                    benchmark_dropdown = gr.Dropdown(
+                        label="Benchmark",
+                        info="Select a benchmark to evaluate your model on"
+                    )
+                    refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
+            submit_model_button = gr.Button("Submit for Evaluation")
+            submission_status = gr.Markdown("")
+        with gr.Tab("Evaluation Queue"):
+            refresh_queue_button = gr.Button("Refresh Queue")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    queue_stats = gr.JSON(
+                        label="Queue Statistics"
+                    )
+                with gr.Column(scale=2):
+                    queue_status = gr.Dataframe(
+                        headers=["ID", "Model", "Benchmark", "Status", "Submitted"],
+                        label="Recent Evaluations"
+                    )
+            with gr.Row(visible=True) as progress_container:
+                with gr.Column():
+                    current_eval_info = gr.Markdown("No evaluation currently running")
+                    # Use a simple text display for progress instead of Progress component
+                    progress_display = gr.Markdown("Progress: 0%")
+            # Function to update progress display
+            def update_progress_display():
+                current_eval, progress = evaluation_queue.get_current_progress()
+                if current_eval:
+                    model_info = db_manager.get_model(current_eval['model_id'])
+                    benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
+                    if model_info and benchmark_info:
+                        eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
+                        progress_text = f"Progress: {progress}%"
+                        return eval_info, progress_text
+                return "No evaluation currently running", "Progress: 0%"
+        # Event handlers
+        def refresh_benchmarks_handler():
+            benchmarks = db_manager.get_benchmarks()
+            # Format for dropdown
+            choices = [(b["id"], b["name"]) for b in benchmarks]
+            return gr.update(choices=choices)
+        def submit_model_handler(model_id, model_name, model_description, model_parameters, model_tag, benchmark_id, request: gr.Request):
+            # Check if user is logged in
+            user = auth_manager.check_login(request)
+            if not user:
+                return "Please log in to submit a model."
+            if not model_id or not model_name or not model_tag or not benchmark_id:
+                return "Please fill in all required fields."
+            try:
+                # Add model to database
+                model_db_id = db_manager.add_model(
+                    name=model_name,
+                    hf_model_id=model_id,
+                    user_id=user["id"],
+                    tag=model_tag,
+                    parameters=str(model_parameters) if model_parameters else None,
+                    description=model_description
+                )
+                if not model_db_id:
+                    return "Failed to add model to database."
+                # Submit for evaluation
+                eval_id, message = evaluation_queue.submit_evaluation(
+                    model_id=model_db_id,
+                    benchmark_id=benchmark_id,
+                    user_id=user["id"]
+                )
+                if eval_id:
+                    return f"Model submitted successfully. Evaluation ID: {eval_id}"
+                else:
+                    return message
+            except Exception as e:
+                return f"Error submitting model: {str(e)}"
+        def refresh_queue_handler():
+            # Get queue statistics
+            stats = evaluation_queue.get_queue_status()
+            # Get recent evaluations
+            evals = db_manager.get_evaluation_results(limit=20)
+            # Format for dataframe
+            eval_data = []
+            for eval in evals:
+                eval_data.append([
+                    eval["id"],
+                    eval["model_name"],
+                    eval["benchmark_name"],
+                    eval["status"],
+                    eval["submitted_at"]
+                ])
+            # Also update progress display
+            current_eval, progress = evaluation_queue.get_current_progress()
+            if current_eval:
+                model_info = db_manager.get_model(current_eval['model_id'])
+                benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
+                if model_info and benchmark_info:
+                    eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
+                    progress_text = f"Progress: {progress}%"
+                    return stats, eval_data, eval_info, progress_text
+            return stats, eval_data, "No evaluation currently running", "Progress: 0%"
+        # Connect event handlers
+        refresh_benchmarks_button.click(
+            fn=refresh_benchmarks_handler,
+            inputs=[],
+            outputs=[benchmark_dropdown]
+        )
+        submit_model_button.click(
+            fn=submit_model_handler,
+            inputs=[
+                model_id_input,
+                model_name_input,
+                model_description_input,
+                model_parameters_input,
+                model_tag_input,
+                benchmark_dropdown
+            ],
+            outputs=[submission_status]
+        )
+        refresh_queue_button.click(
+            fn=refresh_queue_handler,
+            inputs=[],
+            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
+        )
+        # Initialize on load
+        submission_ui.load(
+            fn=refresh_benchmarks_handler,
+            inputs=[],
+            outputs=[benchmark_dropdown]
+        )
+        submission_ui.load(
+            fn=refresh_queue_handler,
+            inputs=[],
+            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
+        )
+        # Manual refresh button with instructions
+        gr.Markdown("""
+        **Note:** Click the 'Refresh Queue' button periodically to update the progress display.
+        """)
+    return submission_ui

leaderboard.py ADDED Viewed

	@@ -0,0 +1,381 @@

+"""
+Leaderboard module for Dynamic Highscores system.
+This module implements the unified leaderboard with tag-based filtering
+for displaying all evaluated models.
+"""
+import os
+import json
+import pandas as pd
+import gradio as gr
+import plotly.express as px
+import plotly.graph_objects as go
+class Leaderboard:
+    """Manages the unified leaderboard with filtering capabilities."""
+    def __init__(self, db_manager):
+        """Initialize the leaderboard manager.
+        Args:
+            db_manager: Database manager instance
+        """
+        self.db_manager = db_manager
+        self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
+        # Define color scheme for tags
+        self.tag_colors = {
+            "Merge": "#FF6B6B",
+            "Agent": "#4ECDC4",
+            "Reasoning": "#FFD166",
+            "Coding": "#6B5B95",
+            "General": "#88D8B0",
+            "Specialized": "#FF8C42",
+            "Instruction": "#5D9CEC",
+            "Chat": "#AC92EB"
+        }
+    def get_leaderboard_data(self, tag=None, benchmark_id=None):
+        """Get leaderboard data, optionally filtered by tag or benchmark.
+        Args:
+            tag: Model tag to filter by (None for all)
+            benchmark_id: Benchmark ID to filter by (None for all)
+        Returns:
+            pd.DataFrame: Leaderboard data
+        """
+        # Get evaluation results from database
+        if tag and tag != "All":
+            df = self.db_manager.get_leaderboard_df(tag=tag)
+        else:
+            df = self.db_manager.get_leaderboard_df()
+        # Filter by benchmark if specified
+        if benchmark_id and not df.empty:
+            df = df[df['benchmark_id'] == benchmark_id]
+        return df
+    def format_leaderboard_for_display(self, df):
+        """Format leaderboard data for display.
+        Args:
+            df: Leaderboard DataFrame
+        Returns:
+            pd.DataFrame: Formatted leaderboard for display
+        """
+        if df.empty:
+            return pd.DataFrame()
+        # Select and rename columns for display
+        display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy()
+        display_df.columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']
+        # Round score to 2 decimal places
+        display_df['Score'] = display_df['Score'].round(2)
+        # Sort by score (descending)
+        display_df = display_df.sort_values('Score', ascending=False)
+        return display_df
+    def create_performance_chart(self, df, chart_type="bar"):
+        """Create a performance chart from leaderboard data.
+        Args:
+            df: Leaderboard DataFrame
+            chart_type: Type of chart to create ("bar" or "scatter")
+        Returns:
+            plotly.graph_objects.Figure: Performance chart
+        """
+        if df.empty:
+            # Return empty figure
+            fig = go.Figure()
+            fig.update_layout(
+                title="No data available",
+                xaxis_title="Model",
+                yaxis_title="Score"
+            )
+            return fig
+        # Prepare data for visualization
+        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
+        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']
+        # Create chart based on type
+        if chart_type == "scatter":
+            fig = px.scatter(
+                plot_df,
+                x="Model",
+                y="Score",
+                color="Tag",
+                symbol="Benchmark",
+                size="Score",
+                hover_data=["Model", "Benchmark", "Score"],
+                color_discrete_map=self.tag_colors
+            )
+        else:  # Default to bar chart
+            fig = px.bar(
+                plot_df,
+                x="Model",
+                y="Score",
+                color="Tag",
+                barmode="group",
+                hover_data=["Model", "Benchmark", "Score"],
+                color_discrete_map=self.tag_colors
+            )
+        # Customize layout
+        fig.update_layout(
+            title="Model Performance Comparison",
+            xaxis_title="Model",
+            yaxis_title="Score",
+            legend_title="Tag",
+            font=dict(size=12)
+        )
+        return fig
+    def create_tag_distribution_chart(self, df):
+        """Create a chart showing distribution of models by tag.
+        Args:
+            df: Leaderboard DataFrame
+        Returns:
+            plotly.graph_objects.Figure: Tag distribution chart
+        """
+        if df.empty:
+            # Return empty figure
+            fig = go.Figure()
+            fig.update_layout(
+                title="No data available",
+                xaxis_title="Tag",
+                yaxis_title="Count"
+            )
+            return fig
+        # Count models by tag
+        tag_counts = df['tag'].value_counts().reset_index()
+        tag_counts.columns = ['Tag', 'Count']
+        # Create pie chart
+        fig = px.pie(
+            tag_counts,
+            names='Tag',
+            values='Count',
+            title='Model Distribution by Tag',
+            color='Tag',
+            color_discrete_map=self.tag_colors
+        )
+        # Customize layout
+        fig.update_layout(
+            font=dict(size=12)
+        )
+        return fig
+    def create_benchmark_comparison_chart(self, df):
+        """Create a chart comparing performance across benchmarks.
+        Args:
+            df: Leaderboard DataFrame
+        Returns:
+            plotly.graph_objects.Figure: Benchmark comparison chart
+        """
+        if df.empty:
+            # Return empty figure
+            fig = go.Figure()
+            fig.update_layout(
+                title="No data available",
+                xaxis_title="Benchmark",
+                yaxis_title="Average Score"
+            )
+            return fig
+        # Calculate average score by benchmark
+        benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
+        benchmark_avg.columns = ['Benchmark', 'Average Score']
+        # Create bar chart
+        fig = px.bar(
+            benchmark_avg,
+            x='Benchmark',
+            y='Average Score',
+            title='Average Performance by Benchmark',
+            color='Benchmark'
+        )
+        # Customize layout
+        fig.update_layout(
+            xaxis_title="Benchmark",
+            yaxis_title="Average Score",
+            font=dict(size=12)
+        )
+        return fig
+# Leaderboard UI components
+def create_leaderboard_ui(leaderboard, db_manager):
+    """Create the leaderboard UI components.
+    Args:
+        leaderboard: Leaderboard instance
+        db_manager: Database manager instance
+    Returns:
+        gr.Blocks: Gradio Blocks component with leaderboard UI
+    """
+    with gr.Blocks() as leaderboard_ui:
+        gr.Markdown("# Dynamic Highscores Leaderboard")
+        with gr.Row():
+            with gr.Column(scale=1):
+                tag_filter = gr.Dropdown(
+                    choices=leaderboard.model_tags,
+                    value="All",
+                    label="Filter by Tag"
+                )
+                benchmark_filter = gr.Dropdown(
+                    choices=[("all", "All Benchmarks")],
+                    value="all",
+                    label="Filter by Benchmark"
+                )
+                refresh_button = gr.Button("Refresh Leaderboard")
+            with gr.Column(scale=2):
+                chart_type = gr.Radio(
+                    choices=["bar", "scatter"],
+                    value="bar",
+                    label="Chart Type"
+                )
+                view_type = gr.Radio(
+                    choices=["Table", "Chart", "Dashboard"],
+                    value="Table",
+                    label="View Type"
+                )
+        # Table view
+        leaderboard_table = gr.Dataframe(
+            headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
+            label="Leaderboard",
+            visible=True
+        )
+        # Chart view
+        with gr.Row(visible=False) as chart_view:
+            performance_chart = gr.Plot(label="Performance Chart")
+        # Dashboard view
+        with gr.Row(visible=False) as dashboard_view:
+            with gr.Column(scale=2):
+                dashboard_performance_chart = gr.Plot(label="Performance Comparison")
+            with gr.Column(scale=1):
+                with gr.Row():
+                    tag_distribution_chart = gr.Plot(label="Model Distribution")
+                with gr.Row():
+                    benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")
+        # Event handlers
+        def refresh_benchmarks():
+            benchmarks = db_manager.get_benchmarks()
+            # Format for dropdown
+            choices = [("all", "All Benchmarks")]
+            choices.extend([(str(b["id"]), b["name"]) for b in benchmarks])
+            return gr.update(choices=choices)
+        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
+            # Get leaderboard data
+            if benchmark_id == "all":
+                benchmark_id = None
+            else:
+                benchmark_id = int(benchmark_id)
+            df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)
+            # Format for display
+            display_df = leaderboard.format_leaderboard_for_display(df)
+            # Create charts
+            perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
+            tag_chart = leaderboard.create_tag_distribution_chart(df)
+            benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)
+            # Update visibility based on view type
+            table_visible = view_type_val == "Table"
+            chart_visible = view_type_val == "Chart"
+            dashboard_visible = view_type_val == "Dashboard"
+            return (
+                display_df,
+                perf_chart,
+                perf_chart,  # Same chart for both views
+                tag_chart,
+                benchmark_chart,
+                gr.update(visible=table_visible),
+                gr.update(visible=chart_visible),
+                gr.update(visible=dashboard_visible)
+            )
+        # Connect event handlers
+        refresh_button.click(
+            fn=lambda tag, benchmark, chart_t, view_t: update_leaderboard(tag, benchmark, chart_t, view_t),
+            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
+            outputs=[
+                leaderboard_table,
+                performance_chart,
+                dashboard_performance_chart,
+                tag_distribution_chart,
+                benchmark_comparison_chart,
+                leaderboard_table,
+                chart_view,
+                dashboard_view
+            ]
+        )
+        view_type.change(
+            fn=lambda view_t: (
+                gr.update(visible=view_t == "Table"),
+                gr.update(visible=view_t == "Chart"),
+                gr.update(visible=view_t == "Dashboard")
+            ),
+            inputs=[view_type],
+            outputs=[leaderboard_table, chart_view, dashboard_view]
+        )
+        # Initialize on load
+        leaderboard_ui.load(
+            fn=refresh_benchmarks,
+            inputs=[],
+            outputs=[benchmark_filter]
+        )
+        leaderboard_ui.load(
+            fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
+            inputs=[],
+            outputs=[
+                leaderboard_table,
+                performance_chart,
+                dashboard_performance_chart,
+                tag_distribution_chart,
+                benchmark_comparison_chart,
+                leaderboard_table,
+                chart_view,
+                dashboard_view
+            ]
+        )
+    return leaderboard_ui

requirements.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+# Requirements file for Dynamic Highscores system.
+# This file lists all the dependencies required to run the Dynamic Highscores application.
+# Core dependencies
+gradio>=4.4.0
+huggingface-hub>=0.27.1
+datasets>=2.14.5
+transformers>=4.35.2
+torch>=2.0.0
+pandas>=2.0.0
+numpy>=1.24.2
+plotly>=5.13.0
+# Scheduling and background tasks
+APScheduler>=3.10.1
+# Utilities
+tqdm>=4.65.0
+requests>=2.28.2
+python-dateutil>=2.8.2

sample_benchmarks.py ADDED Viewed

	@@ -0,0 +1,66 @@

+"""
+Sample benchmarks initialization for Dynamic Highscores system.
+This script adds sample benchmarks to the database to provide initial options for users.
+"""
+from database_schema import init_db
+def add_sample_benchmarks():
+    """Add sample benchmarks to the database."""
+    # Initialize database
+    db = init_db()
+    # Sample benchmarks to add
+    sample_benchmarks = [
+        {
+            "name": "MMLU (Massive Multitask Language Understanding)",
+            "dataset_id": "cais/mmlu",
+            "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.",
+            "metrics": {"accuracy": 1.0, "consistency": 1.0}
+        },
+        {
+            "name": "HumanEval (Code Generation)",
+            "dataset_id": "openai/humaneval",
+            "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
+            "metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
+        },
+        {
+            "name": "HellaSwag (Commonsense Reasoning)",
+            "dataset_id": "hellaswag",
+            "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
+            "metrics": {"accuracy": 1.0}
+        },
+        {
+            "name": "GSM8K (Grade School Math)",
+            "dataset_id": "gsm8k",
+            "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
+            "metrics": {"accuracy": 1.0, "correct_steps": 1.0}
+        },
+        {
+            "name": "TruthfulQA",
+            "dataset_id": "truthful_qa",
+            "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.",
+            "metrics": {"accuracy": 1.0, "truthfulness": 1.0}
+        }
+    ]
+    # Add each benchmark to the database
+    for benchmark in sample_benchmarks:
+        benchmark_id = db.add_benchmark(
+            name=benchmark["name"],
+            dataset_id=benchmark["dataset_id"],
+            description=benchmark["description"],
+            metrics=benchmark["metrics"]
+        )
+        print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")
+    # Close database connection
+    db.close()
+    return len(sample_benchmarks)
+if __name__ == "__main__":
+    num_added = add_sample_benchmarks()
+    print(f"Added {num_added} sample benchmarks to the database.")

space.yaml ADDED Viewed

	@@ -0,0 +1,23 @@

+# Deployment configuration for Dynamic Highscores on HuggingFace Spaces.
+# This file configures the application for deployment on HuggingFace Spaces.
+sdk_version: 3.0.0
+app_file: app.py
+models:
+  - huggingface-hub
+  - transformers
+  - datasets
+  - torch
+  - gradio
+  - pandas
+  - plotly
+  - apscheduler
+  - tqdm
+  - requests
+  - python-dateutil
+  - numpy
+python_version: 3.10.12
+hf_oauth: true

test_app.py ADDED Viewed

	@@ -0,0 +1,237 @@

+"""
+Test script for Dynamic Highscores application.
+This script tests the key functionality of the Dynamic Highscores application
+to ensure everything works as expected before deployment.
+"""
+import os
+import unittest
+import tempfile
+import sqlite3
+from unittest.mock import MagicMock, patch
+# Import components to test
+from database_schema import DynamicHighscoresDB
+from auth import HuggingFaceAuth
+from benchmark_selection import BenchmarkSelector
+from evaluation_queue import EvaluationQueue
+from leaderboard import Leaderboard
+class TestDynamicHighscores(unittest.TestCase):
+    """Test cases for Dynamic Highscores application."""
+    def setUp(self):
+        """Set up test environment."""
+        # Create temporary database
+        self.db_fd, self.db_path = tempfile.mkstemp()
+        self.db = DynamicHighscoresDB(self.db_path)
+        # Mock auth manager
+        self.auth_manager = HuggingFaceAuth(self.db)
+        # Mock components
+        self.benchmark_selector = BenchmarkSelector(self.db, self.auth_manager)
+        self.evaluation_queue = EvaluationQueue(self.db, self.auth_manager)
+        self.leaderboard = Leaderboard(self.db)
+    def tearDown(self):
+        """Clean up test environment."""
+        os.close(self.db_fd)
+        os.unlink(self.db_path)
+    def test_database_schema(self):
+        """Test database schema creation."""
+        # Check if tables were created
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        # Get list of tables
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+        tables = cursor.fetchall()
+        table_names = [table[0] for table in tables]
+        # Check if all expected tables exist
+        expected_tables = ['users', 'benchmarks', 'models', 'evaluations', 'queue']
+        for table in expected_tables:
+            self.assertIn(table, table_names)
+        conn.close()
+    def test_user_management(self):
+        """Test user management functionality."""
+        # Add a test user
+        user_id = self.db.add_user("test_user", "test_hf_id", False)
+        self.assertIsNotNone(user_id)
+        # Add an admin user
+        admin_id = self.db.add_user("admin_user", "admin_hf_id", True)
+        self.assertIsNotNone(admin_id)
+        # Test submission limits
+        self.assertTrue(self.db.can_submit_today(user_id))
+        self.db.update_submission_date(user_id)
+        self.assertFalse(self.db.can_submit_today(user_id))
+        # Admin should always be able to submit
+        self.assertTrue(self.db.can_submit_today(admin_id))
+    def test_benchmark_management(self):
+        """Test benchmark management functionality."""
+        # Add a test benchmark
+        benchmark_id = self.db.add_benchmark(
+            name="Test Benchmark",
+            dataset_id="test/dataset",
+            description="Test description",
+            metrics={"accuracy": 1.0}
+        )
+        self.assertIsNotNone(benchmark_id)
+        # Get benchmarks
+        benchmarks = self.db.get_benchmarks()
+        self.assertEqual(len(benchmarks), 1)
+        self.assertEqual(benchmarks[0]["name"], "Test Benchmark")
+    def test_model_management(self):
+        """Test model management functionality."""
+        # Add a test user
+        user_id = self.db.add_user("test_user", "test_hf_id", False)
+        # Add a test model
+        model_id = self.db.add_model(
+            name="Test Model",
+            hf_model_id="test/model",
+            user_id=user_id,
+            tag="Reasoning",
+            parameters="7B",
+            description="Test model description"
+        )
+        self.assertIsNotNone(model_id)
+        # Get models
+        models = self.db.get_models()
+        self.assertEqual(len(models), 1)
+        self.assertEqual(models[0]["name"], "Test Model")
+        # Get models by tag
+        models = self.db.get_models(tag="Reasoning")
+        self.assertEqual(len(models), 1)
+        self.assertEqual(models[0]["tag"], "Reasoning")
+    def test_evaluation_management(self):
+        """Test evaluation management functionality."""
+        # Add a test user
+        user_id = self.db.add_user("test_user", "test_hf_id", False)
+        # Add a test model
+        model_id = self.db.add_model(
+            name="Test Model",
+            hf_model_id="test/model",
+            user_id=user_id,
+            tag="Reasoning"
+        )
+        # Add a test benchmark
+        benchmark_id = self.db.add_benchmark(
+            name="Test Benchmark",
+            dataset_id="test/dataset"
+        )
+        # Add a test evaluation
+        evaluation_id = self.db.add_evaluation(
+            model_id=model_id,
+            benchmark_id=benchmark_id
+        )
+        self.assertIsNotNone(evaluation_id)
+        # Update evaluation status
+        self.db.update_evaluation_status(
+            evaluation_id=evaluation_id,
+            status="running"
+        )
+        # Get next in queue
+        next_eval = self.db.get_next_in_queue()
+        self.assertIsNotNone(next_eval)
+        self.assertEqual(next_eval["evaluation_id"], evaluation_id)
+        # Complete evaluation
+        self.db.update_evaluation_status(
+            evaluation_id=evaluation_id,
+            status="completed",
+            results={"accuracy": 0.85},
+            score=85.0
+        )
+        # Get evaluation results
+        results = self.db.get_evaluation_results()
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]["score"], 85.0)
+    def test_leaderboard(self):
+        """Test leaderboard functionality."""
+        # Add test data
+        user_id = self.db.add_user("test_user", "test_hf_id", False)
+        # Add models with different tags
+        model1_id = self.db.add_model(
+            name="Model 1",
+            hf_model_id="test/model1",
+            user_id=user_id,
+            tag="Reasoning"
+        )
+        model2_id = self.db.add_model(
+            name="Model 2",
+            hf_model_id="test/model2",
+            user_id=user_id,
+            tag="Coding"
+        )
+        # Add a benchmark
+        benchmark_id = self.db.add_benchmark(
+            name="Test Benchmark",
+            dataset_id="test/dataset"
+        )
+        # Add evaluations
+        eval1_id = self.db.add_evaluation(
+            model_id=model1_id,
+            benchmark_id=benchmark_id
+        )
+        eval2_id = self.db.add_evaluation(
+            model_id=model2_id,
+            benchmark_id=benchmark_id
+        )
+        # Complete evaluations
+        self.db.update_evaluation_status(
+            evaluation_id=eval1_id,
+            status="completed",
+            results={"accuracy": 0.9},
+            score=90.0
+        )
+        self.db.update_evaluation_status(
+            evaluation_id=eval2_id,
+            status="completed",
+            results={"accuracy": 0.8},
+            score=80.0
+        )
+        # Get leaderboard data
+        df = self.leaderboard.get_leaderboard_data()
+        self.assertEqual(len(df), 2)
+        # Test filtering by tag
+        df_reasoning = self.leaderboard.get_leaderboard_data(tag="Reasoning")
+        self.assertEqual(len(df_reasoning), 1)
+        self.assertEqual(df_reasoning.iloc[0]["score"], 90.0)
+        df_coding = self.leaderboard.get_leaderboard_data(tag="Coding")
+        self.assertEqual(len(df_coding), 1)
+        self.assertEqual(df_coding.iloc[0]["score"], 80.0)
+# Run the test suite when this file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()

todo.md ADDED Viewed

	@@ -0,0 +1,48 @@

+# Dynamic Highscores - Todo List
+## Analysis and Planning
+- [x] Extract and analyze uploaded framework files
+- [x] Examine leaderboard component structure and functionality
+- [x] Examine dashboard component structure and functionality
+- [x] Analyze requirements.txt files for dependencies
+## Database Schema Design
+- [x] Design schema for user authentication and tracking
+- [x] Design schema for benchmark datasets
+- [x] Design schema for model submissions and evaluations
+- [x] Design schema for tagging system (Merge, Agent, Reasoning, Coding, etc.)
+- [x] Design schema for daily submission limits
+## User Authentication System
+- [x] Implement HuggingFace login integration
+- [x] Create user profile management
+- [x] Implement special privileges for admin account
+## Benchmark Selection Interface
+- [x] Create interface for browsing HuggingFace datasets
+- [x] Implement dataset loading functionality
+- [x] Create dataset preview and selection UI
+## Model Evaluation Queue System
+- [x] Implement CPU-only evaluation system
+- [x] Create queue management for benchmark submissions
+- [x] Implement daily submission limit (1 per day per user)
+- [x] Add admin override for submission limits
+## Leaderboard with Filtering
+- [x] Implement unified leaderboard for all models
+- [x] Add tag-based filtering (Merge, Agent, Reasoning, Coding)
+- [x] Implement sorting and searching functionality
+- [x] Create visualization components for benchmark results
+## Integration
+- [x] Combine dashboard and leaderboard components
+- [x] Create unified UI with consistent styling
+- [x] Implement navigation between different sections
+- [x] Ensure proper data flow between components
+## Testing and Deployment
+- [x] Test user authentication flow
+- [x] Test benchmark selection and submission
+- [x] Test leaderboard filtering and visualization
+- [x] Prepare for deployment on HuggingFace Spaces