# SLM-RAG-Arena
#
# Running on Zero
#
# File size: 6,873 Bytes

import os
import json
import pandas as pd
import random
import re
from .context_processor import process_highlights

# Module-level cache for the arena DataFrame. It is filled lazily by the first
# successful load_arena_data() call (not at import time) and reused afterwards.
_ARENA_DATA = None

def load_arena_data():
    """
    Load the arena examples from ``arena_df.csv`` and cache them in memory.

    The file is looked up at ``utils/arena_df.csv`` relative to the current
    working directory first. If it is not there, the directory containing
    this module is tried as a fallback, so the loader also works when the
    process is started from another directory.
    NOTE(review): the fallback assumes this module sits in the ``utils``
    package next to the CSV - confirm against the repository layout.

    Returns:
        The cached ``pandas.DataFrame`` after a successful load. If the file
        cannot be read, the error is printed and an empty DataFrame is
        returned; a failure is not cached, so the next call tries again.
    """
    global _ARENA_DATA

    # Serve from the in-memory cache when a previous load succeeded
    if _ARENA_DATA is not None:
        return _ARENA_DATA

    # Preferred location: relative to the working directory (original behaviour)
    csv_path = os.path.join('utils', 'arena_df.csv')
    if not os.path.isfile(csv_path):
        # Fallback location: alongside this module, independent of the CWD
        module_dir = os.path.dirname(os.path.abspath(__file__))
        beside_module = os.path.join(module_dir, 'arena_df.csv')
        if os.path.isfile(beside_module):
            csv_path = beside_module

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Best effort: callers check `.empty` and fall back to a dummy example
        print(f"Error loading arena data: {e}")
        return pd.DataFrame()

    print(f"Loaded arena data with {len(df)} examples")

    # Only successful loads are stored globally
    _ARENA_DATA = df
    return df

def create_dummy_example():
    """
    Build the placeholder example used when no arena data could be loaded.

    get_random_example() returns this dict in place of a real row, so it
    also carries the ``full_contexts`` and ``insufficient_reason`` keys that
    real examples always have; consumers can index them without a KeyError.

    Returns:
        A fresh dict on every call (safe for the caller to mutate).
    """
    # NOTE(review): "contexts" holds a plain string here, while real examples
    # hold dicts with chunk_num/content/is_primary - left unchanged because
    # callers may rely on it; confirm how the UI renders this entry.
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": ["No context available"],
        "full_context": "Error loading context data.",
        "Answerable": False,
        "insufficient": True,
        # Same keys and empty defaults that get_random_example() uses
        "insufficient_reason": "",
        "full_contexts": [],
    }

# Fallback patterns, used only when a `contexts_highlighted` cell is not valid JSON.
_HIGHLIGHT_TYPE_RE = re.compile(r'"type":\s*"(primary|secondary)"')
# The string body accepts backslash escapes, so an escaped quote (\") no longer
# ends the match early; the closing quote is optional so that a truncated
# trailing string is still captured.
_HIGHLIGHT_CONTENT_RE = re.compile(r'"abbreviatedContent":\s*"((?:[^"\\]|\\.)*)"?')

def _is_missing(value):
    """Return True for None/NaN/NA scalars, i.e. an empty cell in the CSV."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))

def _field(row, key, default):
    """Return row[key], or `default` when the column is absent or the cell is empty."""
    value = row.get(key, default)
    return default if _is_missing(value) else value

def _decode_json_fragment(raw):
    """Decode the body of a JSON string literal; return it unchanged if it is malformed."""
    try:
        # strict=False tolerates raw control characters inside the string
        return json.loads(f'"{raw}"', strict=False)
    except ValueError:
        return raw

def _parse_full_contexts(raw):
    """Turn the JSON text of the `contexts` cell into [{'chunk_num', 'content'}, ...]."""
    if not isinstance(raw, str):
        return []
    try:
        chunks = json.loads(raw)
    except ValueError as e:
        print(f"Error processing contexts: {e}")
        return []
    if not isinstance(chunks, list):
        return []
    # chunk_num is the 1-based position in the source list, so entries that
    # are skipped (not a dict, or no 'content') leave gaps in the numbering.
    return [
        {'chunk_num': position, 'content': chunk.get('content', '')}
        for position, chunk in enumerate(chunks, start=1)
        if isinstance(chunk, dict) and 'content' in chunk
    ]

def _parse_highlighted_items(raw):
    """Return the raw highlighted-context items (normally dicts) from a `contexts_highlighted` value."""
    if isinstance(raw, (list, tuple)):
        # Already an object, not a string
        return list(raw)
    if not isinstance(raw, str):
        # Missing column or empty cell (NaN): nothing to parse
        return []

    # Try direct JSON parsing first; it decodes escapes and keeps each type
    # paired with its own content.
    try:
        parsed = json.loads(raw)
    except ValueError:
        parsed = None
    if isinstance(parsed, list) and any(
        isinstance(item, dict) and 'abbreviatedContent' in item for item in parsed
    ):
        return parsed

    # Otherwise (malformed/truncated JSON or an unexpected shape) extract the
    # fields with regexes; types and contents are paired by order of appearance.
    types = _HIGHLIGHT_TYPE_RE.findall(raw)
    contents = _HIGHLIGHT_CONTENT_RE.findall(raw)
    return [
        {'type': ctx_type, 'abbreviatedContent': _decode_json_fragment(content)}
        for ctx_type, content in zip(types, contents)
    ]

def get_random_example():
    """
    Select a random example from the loaded arena data.

    Returns:
        A dict with the keys ``question``, ``processed_context_desc``,
        ``Answerable``, ``insufficient``, ``insufficient_reason``,
        ``full_contexts`` (list of ``{'chunk_num', 'content'}``) and
        ``contexts`` (list of ``{'chunk_num', 'content', 'is_primary'}``).
        Optional columns that are absent or empty fall back to their
        defaults. When no data could be loaded, the dict produced by
        create_dummy_example() is returned instead.

    Raises:
        KeyError: if the data has no ``question`` column.
    """
    # Get the globally stored data - only read from disk until a load succeeds
    df = load_arena_data()

    if df.empty:
        # If no data is loaded, return a dummy example
        return create_dummy_example()

    # Select a random row
    example = df.sample(1).iloc[0]

    processed_example = {
        "question": example['question'],
        "processed_context_desc": _field(example, 'processed_context_desc', ''),
        "Answerable": _field(example, 'Answerable', True),  # Default to True unless specified otherwise
        "insufficient": _field(example, 'insufficient', False),
        "insufficient_reason": _field(example, 'insufficient_reason', ''),
    }

    # Full (unabbreviated) contexts
    full_contexts = _parse_full_contexts(example.get('contexts'))
    processed_example["full_contexts"] = full_contexts

    # Highlighted contexts for display
    contexts_highlighted = []
    try:
        items = _parse_highlighted_items(example.get('contexts_highlighted'))
        for i, item in enumerate(items):
            if not isinstance(item, dict):
                continue
            contexts_highlighted.append({
                'chunk_num': i + 1,
                # Process highlights using the standard format
                'content': process_highlights(item.get('abbreviatedContent', '')),
                'is_primary': item.get('type', 'secondary') == 'primary',
            })
    except Exception as e:
        # Best effort: keep whatever was processed before the failure
        print(f"Error processing highlighted contexts: {e}")

    # If we couldn't process the highlighted contexts, fall back to the full contexts
    if not contexts_highlighted and full_contexts:
        contexts_highlighted = [
            {
                'chunk_num': i + 1,
                'content': ctx.get('content', ''),
                'is_primary': False,
            }
            for i, ctx in enumerate(full_contexts)
        ]

    processed_example["contexts"] = contexts_highlighted

    return processed_example

def get_random_example_and_models(model_names):
    """
    Select a random example and pick two different entries of `model_names`
    for positions A and B.

    Args:
        model_names: Iterable of model names. Any iterable is accepted
            (list, tuple, dict keys, set, ...); it is copied into a list.

    Returns:
        A tuple ``(example, model_a_name, model_b_name)`` where `example`
        is the dict returned by get_random_example().

    Raises:
        ValueError: if fewer than two model names are supplied.
    """
    # random.sample() requires a sequence; copying into a list also lets
    # callers pass dict views, sets or generators.
    candidates = list(model_names)
    if len(candidates) < 2:
        # Fail before drawing an example, with a clearer message than
        # random.sample()'s "Sample larger than population"
        raise ValueError(
            f"Need at least two models to choose from, got {len(candidates)}"
        )

    example = get_random_example()
    # Sampling without replacement: A and B are two different list entries
    model_a_name, model_b_name = random.sample(candidates, 2)
    return example, model_a_name, model_b_name