Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,873 Bytes
8a142a6 f85a3ff 8a142a6 f85a3ff 8a142a6 f85a3ff 8a142a6 f85a3ff 8a142a6 f85a3ff 8a142a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
import os
import json
import pandas as pd
import random
import re
from .context_processor import process_highlights
# Module-level cache for the arena DataFrame. It starts out empty (None) and
# is populated by the first successful load_arena_data() call, not at import
# time; later calls reuse the cached object instead of re-reading the CSV.
_ARENA_DATA = None
def load_arena_data():
    """
    Return the arena examples as a pandas DataFrame.

    The CSV is read from the relative path ``utils/arena_df.csv`` the first
    time this is called; on success the frame is cached in the module-level
    ``_ARENA_DATA`` and every later call returns that same object.

    If anything goes wrong while reading, the error is printed and an empty
    DataFrame is returned. The failure is not cached, so the next call will
    attempt the read again.
    """
    global _ARENA_DATA

    if _ARENA_DATA is None:
        try:
            frame = pd.read_csv(os.path.join('utils', 'arena_df.csv'))
            print(f"Loaded arena data with {len(frame)} examples")
            # Only a fully successful read is remembered.
            _ARENA_DATA = frame
        except Exception as err:
            print(f"Error loading arena data: {err}")
            # Callers detect the failure through DataFrame.empty.
            return pd.DataFrame()

    return _ARENA_DATA
def create_dummy_example():
    """
    Build the placeholder example returned when no arena data is available.

    Returns a new dict on every call. Besides the original error-message
    fields it also carries ``full_contexts`` and ``insufficient_reason`` so
    that it exposes every key ``get_random_example`` sets on a real example;
    code that reads those keys no longer fails only on the fallback path.
    """
    # NOTE(review): "contexts" is a list of plain strings here, whereas
    # get_random_example() produces a list of dicts (chunk_num / content /
    # is_primary). Left unchanged for backward compatibility - confirm what
    # the consumers expect.
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": ["No context available"],
        "full_context": "Error loading context data.",
        "full_contexts": [],
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": "",
    }
def _parse_full_contexts(contexts_raw):
    """Convert the raw ``contexts`` value into a list of numbered chunk dicts."""
    # Accept a JSON-encoded string as well as an already-decoded object, the
    # same way the highlighted contexts accept both forms.
    if isinstance(contexts_raw, str):
        contexts = json.loads(contexts_raw)
    else:
        contexts = contexts_raw
    if not isinstance(contexts, list):
        return []
    # chunk_num is the 1-based position in the source list, so entries that
    # are skipped (not a dict, or no 'content') leave a gap in the numbering.
    return [
        {'chunk_num': position, 'content': chunk.get('content', '')}
        for position, chunk in enumerate(contexts, start=1)
        if isinstance(chunk, dict) and 'content' in chunk
    ]


def _extract_highlighted_items(raw):
    """Return ``type``/``abbreviatedContent`` dicts from the raw highlighted value."""
    if isinstance(raw, str):
        # The string is scanned with regexes rather than decoded as JSON.
        type_pattern = r'"type":\s*"(primary|secondary)"'
        # Second alternative matches a value whose closing quote is missing.
        content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'
        types = re.findall(type_pattern, raw)
        # Two capture groups make findall yield 2-tuples; keep whichever
        # group matched (or "" when the content itself is empty).
        contents = [
            next((group for group in match if group), "")
            for match in re.findall(content_pattern, raw)
        ]
        # NOTE(review): types and contents are paired purely by position and
        # a content value ends at the first double quote, so an escaped quote
        # inside the text truncates it - confirm this suits the data format.
        return [
            {'type': ctx_type, 'abbreviatedContent': content}
            for ctx_type, content in zip(types, contents)
        ]
    if isinstance(raw, (list, tuple)):
        # Already decoded, use as-is.
        return list(raw)
    # Anything else (e.g. the NaN pandas uses for an empty cell) has no items;
    # returning [] lets the caller fall back without logging a spurious error.
    return []


def get_random_example():
    """
    Pick one random row from the arena data and shape it for the application.

    Returns:
        dict: the result of ``create_dummy_example()`` when no data is
        loaded; otherwise a dict with the keys

        * ``question``, ``processed_context_desc``, ``Answerable``,
          ``insufficient``, ``insufficient_reason`` - copied from the row;
        * ``full_contexts`` - list of ``{'chunk_num', 'content'}`` dicts
          built from the row's ``contexts`` value (empty on any error);
        * ``contexts`` - list of ``{'chunk_num', 'content', 'is_primary'}``
          dicts built from ``contexts_highlighted``, or from
          ``full_contexts`` (all non-primary) when no highlighted context
          could be produced.

    Problems while processing either context column are printed and treated
    as "no contexts" rather than raised.
    """
    # load_arena_data() returns the cached frame after the first good load.
    df = load_arena_data()
    if df.empty:
        return create_dummy_example()

    example = df.sample(1).iloc[0]

    # NOTE(review): the defaults below only apply when the column is absent
    # from the row; an empty CSV cell is expected to come through as NaN
    # instead - confirm the consumers cope with that.
    processed_example = {
        "question": example['question'],
        "processed_context_desc": example.get('processed_context_desc', ''),
        "Answerable": example.get('Answerable', True),
        "insufficient": example.get('insufficient', False),
        "insufficient_reason": example.get('insufficient_reason', ''),
    }

    # Full contexts: best effort, any failure degrades to an empty list.
    try:
        processed_example["full_contexts"] = _parse_full_contexts(example['contexts'])
    except Exception as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Highlighted contexts for display.
    contexts_highlighted = []
    try:
        if 'contexts_highlighted' in example:
            items = _extract_highlighted_items(example['contexts_highlighted'])
            for position, item in enumerate(items, start=1):
                if not isinstance(item, dict):
                    continue
                content = process_highlights(item.get('abbreviatedContent', ''))
                contexts_highlighted.append({
                    'chunk_num': position,
                    'content': content,
                    'is_primary': item.get('type', 'secondary') == 'primary',
                })
    except Exception as e:
        # Items appended before the failure are kept.
        print(f"Error processing highlighted contexts: {e}")

    # Nothing usable was highlighted: show the full contexts instead,
    # renumbered consecutively and never marked primary.
    full_contexts = processed_example["full_contexts"]
    if not contexts_highlighted and full_contexts:
        contexts_highlighted = [
            {
                'chunk_num': position,
                'content': ctx.get('content', ''),
                'is_primary': False,
            }
            for position, ctx in enumerate(full_contexts, start=1)
        ]

    processed_example["contexts"] = contexts_highlighted
    return processed_example
def get_random_example_and_models(model_names):
    """
    Draw a random arena example together with two different models.

    The example comes from ``get_random_example()``; the models are taken
    from ``model_names`` with ``random.sample`` (sampling without
    replacement, which raises ``ValueError`` if fewer than two names are
    supplied).

    Returns a 3-tuple ``(example, model_a_name, model_b_name)``.
    """
    # The example is fetched before the models are drawn.
    chosen_example = get_random_example()
    model_pair = random.sample(model_names, 2)
    return (chosen_example, *model_pair)