# SLM-RAG-Arena — utils/data_loader.py
# (origin: oliver-aizip, commit f85a3ff "update data loading (#1)")
import os
import json
import pandas as pd
import random
import re
from .context_processor import process_highlights
# Global data store - loaded once at import time (module-level cache so the
# CSV is read from disk at most once per process).
_ARENA_DATA = None


def load_arena_data():
    """
    Load the arena data from ``arena_df.csv`` that lives next to this module.

    The parsed DataFrame is cached in the module-level ``_ARENA_DATA``;
    subsequent calls return the cached object without touching the disk.

    Returns:
        pd.DataFrame: the arena examples, or an empty DataFrame if the
        file could not be read (the error is printed, not raised).
    """
    global _ARENA_DATA

    # If data is already loaded, return it.
    if _ARENA_DATA is not None:
        return _ARENA_DATA

    # Resolve the CSV relative to this module's directory instead of the
    # process working directory, so loading works no matter where the app
    # was launched from (the old 'utils/arena_df.csv' path only worked
    # when CWD was the repo root).
    csv_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'arena_df.csv')

    try:
        df = pd.read_csv(csv_path)
        print(f"Loaded arena data with {len(df)} examples")
        # Store the data globally for later calls.
        _ARENA_DATA = df
        return df
    except Exception as e:
        print(f"Error loading arena data: {e}")
        # Return an empty DataFrame if the file can't be loaded. The cache
        # is deliberately left as None so a later call can retry.
        return pd.DataFrame()
def create_dummy_example():
    """Return a placeholder example for when the dataset cannot be loaded.

    The dict mirrors the shape produced by get_random_example() so the UI
    can render it without special-casing the failure path.
    """
    return dict(
        question=(
            "Could not load questions from the dataset."
            " Please check the data file."
        ),
        processed_context_desc="Error: Data not available",
        contexts=["No context available"],
        full_context="Error loading context data.",
        Answerable=False,
        insufficient=True,
    )
def get_random_example():
    """
    Select a random example from the loaded arena data.

    Returns a dict compatible with the application UI, containing:
      - "question", "processed_context_desc", "Answerable", "insufficient",
        "insufficient_reason": copied from the sampled row (with defaults).
      - "full_contexts": list of {'chunk_num', 'content'} parsed from the
        row's JSON-encoded 'contexts' column.
      - "contexts": list of {'chunk_num', 'content', 'is_primary'} built
        from 'contexts_highlighted' (with highlight markup processed), or a
        non-primary copy of "full_contexts" as a fallback.

    Falls back to create_dummy_example() when no data is loaded. Parsing
    errors are printed and degrade gracefully rather than raising.
    """
    # Get the globally stored data - won't reload from disk.
    df = load_arena_data()
    if df.empty:
        # If no data is loaded, return a dummy example.
        return create_dummy_example()

    # Select a random row (pandas Series).
    example = df.sample(1).iloc[0]

    # Process the example data. Series.get supplies defaults for columns
    # that may be absent from the CSV.
    processed_example = {
        "question": example['question'],
        "processed_context_desc": example.get('processed_context_desc', ''),
        "Answerable": example.get('Answerable', True),  # Default to True unless specified otherwise
        "insufficient": example.get('insufficient', False),
        "insufficient_reason": example.get('insufficient_reason', '')
    }

    # Process contexts - for full context. The 'contexts' column is expected
    # to hold a JSON string encoding a list of {'content': ...} chunks; if
    # it is not a string, `contexts` stays unbound and the resulting
    # NameError is caught below, yielding an empty list.
    try:
        contexts_raw = example['contexts']
        if isinstance(contexts_raw, str):
            contexts = json.loads(contexts_raw)
        # Store full contexts as individual items
        full_contexts = []
        if isinstance(contexts, list):
            for i, chunk in enumerate(contexts):
                # Skip malformed chunks that aren't dicts with 'content'.
                if isinstance(chunk, dict) and 'content' in chunk:
                    full_contexts.append({
                        'chunk_num': i + 1,
                        'content': chunk.get('content', '')
                    })
            processed_example["full_contexts"] = full_contexts
        else:
            processed_example["full_contexts"] = []
    except Exception as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Process highlighted contexts for display.
    contexts_highlighted = []
    try:
        # Check if contexts_highlighted exists (and is truthy).
        if 'contexts_highlighted' in example and example['contexts_highlighted']:
            highlighted_contexts = []
            if isinstance(example['contexts_highlighted'], str):
                try:
                    # Try direct JSON parsing first
                    raw_str = example['contexts_highlighted']
                    # First, manually parse the highlighted contexts using regex.
                    # This is a more robust approach for our specific format:
                    # regex extraction tolerates strings that are not valid JSON.
                    type_pattern = r'"type":\s*"(primary|secondary)"'
                    # Two alternatives: properly closed values, and a trailing
                    # value whose closing quote is missing (truncated data).
                    content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'
                    types = re.findall(type_pattern, raw_str)
                    # Handle both regular quotes and escaped quotes in content
                    raw_contents = re.findall(content_pattern, raw_str)
                    # Extract contents from tuple matches (the regex has capture groups)
                    contents = []
                    for match in raw_contents:
                        # Get the non-empty string from the tuple
                        content = next((s for s in match if s), "")
                        contents.append(content)
                    # Create the highlighted contexts from extracted data;
                    # zip silently drops unpaired type/content leftovers.
                    for i, (ctx_type, content) in enumerate(zip(types, contents)):
                        highlighted_contexts.append({
                            'type': ctx_type,
                            'abbreviatedContent': content
                        })
                except Exception as e:
                    print(f"Error extracting contexts with regex: {e}")
            else:
                # Already an object, not a string
                highlighted_contexts = example['contexts_highlighted']
            # Process each context item into the UI shape.
            for i, item in enumerate(highlighted_contexts):
                if isinstance(item, dict):
                    ctx_type = item.get('type', 'secondary')
                    content = item.get('abbreviatedContent', '')
                    # Process highlights using the standard format
                    content = process_highlights(content)
                    contexts_highlighted.append({
                        'chunk_num': i + 1,
                        'content': content,
                        'is_primary': ctx_type == 'primary'
                    })
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # If we couldn't process the highlighted contexts, fall back to the full
    # contexts (marked non-primary so nothing is emphasized).
    if not contexts_highlighted and processed_example["full_contexts"]:
        for i, ctx in enumerate(processed_example["full_contexts"]):
            contexts_highlighted.append({
                'chunk_num': i + 1,
                'content': ctx.get('content', ''),
                'is_primary': False
            })

    processed_example["contexts"] = contexts_highlighted
    return processed_example
def get_random_example_and_models(model_names):
    """
    Pick a random arena example and assign two distinct random models to
    positions A and B.

    Args:
        model_names: sequence of candidate model identifiers (must contain
            at least two entries).

    Returns:
        tuple: (example dict, model A name, model B name).
    """
    chosen_example = get_random_example()
    # random.sample guarantees the two picks are distinct elements.
    picked_pair = random.sample(model_names, 2)
    return chosen_example, picked_pair[0], picked_pair[1]