Spaces:
Restarting
on
Zero
Restarting
on
Zero
File size: 6,590 Bytes
8a142a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
import os
import json
import pandas as pd
import random
import re
from .context_processor import process_highlights
def load_arena_data(csv_path=None):
    """
    Loads the arena data from a CSV file into a pandas DataFrame.

    Args:
        csv_path: Optional path to the CSV file. When omitted, the file
            ``utils/arena_df.csv`` is used. That default is a relative
            path, so it is resolved against the current working directory.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file cannot be
        read or parsed (the error is printed, not raised).
    """
    if csv_path is None:
        csv_path = os.path.join('utils', 'arena_df.csv')

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Deliberate best-effort boundary: callers test ``df.empty`` and
        # substitute a placeholder example instead of crashing.
        print(f"Error loading arena data: {e}")
        return pd.DataFrame()

    print(f"Loaded arena data with {len(df)} examples")
    return df
def create_dummy_example():
    """
    Creates a placeholder example used when no arena data could be loaded.

    Returns:
        A dict with the same top-level keys that get_random_example() sets
        on a real example ("question", "processed_context_desc",
        "Answerable", "insufficient", "insufficient_reason",
        "full_contexts", "contexts"), plus the "full_context" error text,
        so consumers can index either result without a KeyError.
    """
    # NOTE(review): "contexts" holds a plain string here, while
    # get_random_example() produces dicts with chunk_num/content/is_primary.
    # Left unchanged for backward compatibility; confirm consumers handle both.
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": ["No context available"],
        "full_context": "Error loading context data.",
        "full_contexts": [],
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": "",
    }
# Patterns for the regex fallback used when ``contexts_highlighted`` is not
# valid JSON (for example a truncated or hand-edited cell).
_HIGHLIGHT_TYPE_RE = re.compile(r'"type":\s*"(primary|secondary)"')
# Captures everything up to the first *unescaped* quote, or to the end of the
# input when the closing quote is missing. Escaped quotes (\") stay inside
# the capture instead of terminating it.
_HIGHLIGHT_CONTENT_RE = re.compile(
    r'"abbreviatedContent":\s*"((?:[^"\\]|\\.)*)', re.DOTALL
)


def _is_missing(value):
    """Returns True for None and for the float NaN pandas uses for empty cells."""
    return value is None or (isinstance(value, float) and value != value)


def _get_field(example, key, default):
    """Returns example[key], or default when the key is absent or the cell is empty."""
    value = example.get(key, default)
    return default if _is_missing(value) else value


def _decode_json_fragment(fragment):
    """Decodes JSON string escapes in fragment; returns it unchanged if that fails."""
    try:
        # strict=False tolerates raw control characters (e.g. literal newlines).
        return json.loads(f'"{fragment}"', strict=False)
    except ValueError:
        return fragment


def _parse_full_contexts(contexts_raw):
    """Turns the raw 'contexts' cell into a list of {'chunk_num', 'content'} dicts."""
    if isinstance(contexts_raw, str):
        contexts = json.loads(contexts_raw)
    else:
        # Already-parsed values are used as-is instead of being discarded.
        contexts = contexts_raw
    if not isinstance(contexts, list):
        return []
    # chunk_num follows the position in the source list, so skipped entries
    # leave gaps in the numbering.
    return [
        {'chunk_num': i + 1, 'content': chunk.get('content', '')}
        for i, chunk in enumerate(contexts)
        if isinstance(chunk, dict) and 'content' in chunk
    ]


def _parse_highlighted_items(raw):
    """Turns the raw 'contexts_highlighted' cell into a list of item dicts."""
    if not isinstance(raw, str):
        # Already an object; anything that is not a sequence (e.g. NaN from
        # an empty cell) is treated as "no highlighted contexts".
        return list(raw) if isinstance(raw, (list, tuple)) else []

    # Prefer real JSON parsing: it keeps each item's type and content
    # together and decodes escapes correctly.
    try:
        parsed = json.loads(raw)
    except ValueError:
        parsed = None
    if isinstance(parsed, dict):
        parsed = [parsed]
    if isinstance(parsed, list):
        return parsed

    # Fallback for malformed input: pull the fields out with regexes and
    # pair them by position.
    types = _HIGHLIGHT_TYPE_RE.findall(raw)
    contents = _HIGHLIGHT_CONTENT_RE.findall(raw)
    return [
        {'type': ctx_type, 'abbreviatedContent': _decode_json_fragment(content)}
        for ctx_type, content in zip(types, contents)
    ]


def get_random_example():
    """
    Selects a random example from the loaded arena data.

    Returns:
        A dict with the keys "question", "processed_context_desc",
        "Answerable", "insufficient", "insufficient_reason",
        "full_contexts" (list of {'chunk_num', 'content'}) and "contexts"
        (list of {'chunk_num', 'content', 'is_primary'}). When no data could
        be loaded, the result of create_dummy_example() is returned instead.

    Raises:
        KeyError: if the loaded data has no 'question' column.
    """
    df = load_arena_data()
    if df.empty:
        return create_dummy_example()

    # Pick one random row as a Series.
    example = df.sample(1).iloc[0]

    # Optional columns fall back to their defaults both when the column is
    # absent and when the cell is empty (pandas yields NaN for empty cells).
    processed_example = {
        "question": example['question'],
        "processed_context_desc": _get_field(example, 'processed_context_desc', ''),
        "Answerable": _get_field(example, 'Answerable', True),
        "insufficient": _get_field(example, 'insufficient', False),
        "insufficient_reason": _get_field(example, 'insufficient_reason', ''),
    }

    # Full contexts: every chunk, unabridged.
    try:
        processed_example["full_contexts"] = _parse_full_contexts(example['contexts'])
    except (KeyError, TypeError, ValueError) as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Highlighted contexts for display.
    contexts_highlighted = []
    try:
        items = _parse_highlighted_items(example.get('contexts_highlighted'))
        for i, item in enumerate(items):
            if not isinstance(item, dict):
                continue
            # process_highlights is the project helper imported from
            # .context_processor; it is applied to each abbreviated content.
            content = process_highlights(item.get('abbreviatedContent', ''))
            contexts_highlighted.append({
                'chunk_num': i + 1,
                'content': content,
                'is_primary': item.get('type', 'secondary') == 'primary',
            })
    except Exception as e:
        # Best-effort: a failure here must not prevent returning the example.
        print(f"Error processing highlighted contexts: {e}")

    # If nothing could be highlighted, fall back to the full contexts.
    if not contexts_highlighted and processed_example["full_contexts"]:
        contexts_highlighted = [
            {
                'chunk_num': i + 1,
                'content': ctx.get('content', ''),
                'is_primary': False,
            }
            for i, ctx in enumerate(processed_example["full_contexts"])
        ]

    processed_example["contexts"] = contexts_highlighted
    return processed_example
def get_random_example_and_models(model_names):
    """
    Draws one random arena example and picks two entries from model_names
    to fill positions A and B.

    Returns:
        A tuple (example, model_a_name, model_b_name).
    """
    picked_example = get_random_example()
    # random.sample draws without replacement, so A and B come from two
    # different positions of model_names.
    chosen_pair = random.sample(model_names, 2)
    model_a_name = chosen_pair[0]
    model_b_name = chosen_pair[1]
    return picked_example, model_a_name, model_b_name