import json
import os
import random
import re

import pandas as pd

from .context_processor import process_highlights

# Global data store - loaded once at import time
_ARENA_DATA = None


def load_arena_data():
    """
    Loads the arena data from the arena_df.csv file in the utils directory.
    Returns the data in a format compatible with the application.
    """
    global _ARENA_DATA

    # If the data is already loaded, return the cached DataFrame
    if _ARENA_DATA is not None:
        return _ARENA_DATA

    try:
        # Define the path to the CSV file
        csv_path = os.path.join('utils', 'arena_df.csv')

        # Read the CSV file
        df = pd.read_csv(csv_path)
        print(f"Loaded arena data with {len(df)} examples")

        # Store the data globally
        _ARENA_DATA = df
        return df
    except Exception as e:
        print(f"Error loading arena data: {e}")
        # Return an empty DataFrame if the file can't be loaded
        return pd.DataFrame()


def create_dummy_example():
    """Creates a dummy example if no data is loaded."""
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": ["No context available"],
        "full_context": "Error loading context data.",
        "Answerable": False,
        "insufficient": True
    }


def get_random_example():
    """
    Selects a random example from the loaded arena data.
    Returns the example data in a format compatible with the application.
    """
    # Get the globally stored data - this won't reload it from disk
    df = load_arena_data()

    if df.empty:
        # If no data is loaded, return a dummy example
        return create_dummy_example()

    # Select a random row
    example = df.sample(1).iloc[0]

    # Process the example data
    processed_example = {
        "question": example['question'],
        "processed_context_desc": example.get('processed_context_desc', ''),
        # Default to True unless specified otherwise
        "Answerable": example.get('Answerable', True),
        "insufficient": example.get('insufficient', False),
        "insufficient_reason": example.get('insufficient_reason', '')
    }

    # Process contexts - full, unabbreviated context chunks
    try:
        contexts_raw = example['contexts']
        if isinstance(contexts_raw, str):
            contexts = json.loads(contexts_raw)

            # Store full contexts as individual items
            full_contexts = []
            if isinstance(contexts, list):
                for i, chunk in enumerate(contexts):
                    if isinstance(chunk, dict) and 'content' in chunk:
                        full_contexts.append({
                            'chunk_num': i + 1,
                            'content': chunk.get('content', '')
                        })
            processed_example["full_contexts"] = full_contexts
        else:
            processed_example["full_contexts"] = []
    except Exception as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Process highlighted contexts for display
    contexts_highlighted = []
    try:
        # Check that contexts_highlighted exists and is non-empty
        if 'contexts_highlighted' in example and example['contexts_highlighted']:
            highlighted_contexts = []
            if isinstance(example['contexts_highlighted'], str):
                try:
                    raw_str = example['contexts_highlighted']

                    # Manually parse the highlighted contexts with regex;
                    # this is more robust than json.loads for our specific format
                    type_pattern = r'"type":\s*"(primary|secondary)"'
                    content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'

                    types = re.findall(type_pattern, raw_str)
                    # The second alternative also matches a value whose closing quote is missing
                    raw_contents = re.findall(content_pattern, raw_str)

                    # Extract contents from tuple matches (the regex has two capture groups)
                    contents = []
                    for match in raw_contents:
                        # Get the non-empty string from the tuple
                        content = next((s for s in match if s), "")
                        contents.append(content)

                    # Create the highlighted contexts from the extracted data
                    for ctx_type, content in zip(types, contents):
                        highlighted_contexts.append({
                            'type': ctx_type,
                            'abbreviatedContent': content
                        })
                except Exception as e:
                    print(f"Error extracting contexts with regex: {e}")
            else:
                # Already an object, not a string
                highlighted_contexts = example['contexts_highlighted']

            # Process each context item
            for i, item in enumerate(highlighted_contexts):
                if isinstance(item, dict):
                    ctx_type = item.get('type', 'secondary')
                    content = item.get('abbreviatedContent', '')

                    # Process highlights using the standard format
                    content = process_highlights(content)

                    contexts_highlighted.append({
                        'chunk_num': i + 1,
                        'content': content,
                        'is_primary': ctx_type == 'primary'
                    })
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # If we couldn't process the highlighted contexts, fall back to the full contexts
    if not contexts_highlighted and processed_example["full_contexts"]:
        for i, ctx in enumerate(processed_example["full_contexts"]):
            contexts_highlighted.append({
                'chunk_num': i + 1,
                'content': ctx.get('content', ''),
                'is_primary': False
            })

    processed_example["contexts"] = contexts_highlighted

    return processed_example


def get_random_example_and_models(model_names):
    """
    Selects a random example from the arena data and assigns two distinct
    random models to positions A and B.
    """
    example = get_random_example()

    # Choose two different models from the model list
    model_a_name, model_b_name = random.sample(model_names, 2)

    return example, model_a_name, model_b_name
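

# Minimal usage sketch (assumptions: this module is part of a package and is run
# with `python -m <package>.<module>` so the relative import of context_processor
# resolves, and utils/arena_df.csv is reachable from the working directory).
# The model names below are hypothetical placeholders, not real entries.
if __name__ == "__main__":
    example, model_a, model_b = get_random_example_and_models(
        ["model-alpha", "model-beta", "model-gamma"]  # hypothetical model names
    )
    print(f"Question: {example['question']}")
    print(f"Model A: {model_a} | Model B: {model_b}")
    print(f"Number of context chunks: {len(example['contexts'])}")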