File size: 6,873 Bytes
8a142a6
 
 
 
 
 
 
f85a3ff
 
 
8a142a6
 
 
 
 
f85a3ff
 
 
 
 
 
8a142a6
 
 
 
 
 
 
f85a3ff
 
 
8a142a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f85a3ff
8a142a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f85a3ff
8a142a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import json
import pandas as pd
import random
import re
from .context_processor import process_highlights

# Module-level cache for the arena DataFrame. It starts as None and is filled
# in by load_arena_data() after its first successful CSV read (lazily, on the
# first call -- not at import time); later calls reuse the cached object.
_ARENA_DATA = None

def load_arena_data():
    """
    Return the arena dataset, reading utils/arena_df.csv on first use.

    The path is resolved relative to the current working directory. A
    successful read is stored in the module-level ``_ARENA_DATA`` so later
    calls return the same DataFrame without touching the disk. If the read
    fails, the error is printed and a fresh empty DataFrame is returned; the
    failure is not cached, so the next call tries the file again.
    """
    global _ARENA_DATA

    if _ARENA_DATA is None:
        try:
            loaded = pd.read_csv(os.path.join('utils', 'arena_df.csv'))
            print(f"Loaded arena data with {len(loaded)} examples")
        except Exception as err:
            print(f"Error loading arena data: {err}")
            # Signal "no data" to callers without poisoning the cache
            return pd.DataFrame()
        _ARENA_DATA = loaded

    return _ARENA_DATA

def create_dummy_example():
    """
    Build the placeholder example used when no arena data could be loaded.

    Besides the original placeholder fields, the dict carries an empty
    ``full_contexts`` list and an empty ``insufficient_reason`` string. Those
    are the two keys get_random_example() sets on every real example, so code
    that indexes them also works on the placeholder.

    Returns:
        A new dict on every call, so callers may mutate it freely.
    """
    # NOTE(review): "contexts" here is a list of plain strings, whereas
    # get_random_example() produces a list of dicts (chunk_num / content /
    # is_primary). Left unchanged for compatibility -- confirm consumers
    # handle both shapes.
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": ["No context available"],
        "full_context": "Error loading context data.",
        "full_contexts": [],
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": "",
    }

# Finds each highlighted item's type tag in the raw contexts_highlighted text.
_HIGHLIGHT_TYPE_RE = re.compile(r'"type":\s*"(primary|secondary)"')

# Captures the text of each abbreviatedContent string value. Backslash escapes
# are consumed as two-character pairs, so an escaped quote (\") no longer ends
# the value early. The closing quote is not required, so a value that is cut
# off at the end of the text still matches; a dangling final backslash is kept.
# The captured text is left raw (escape sequences are not decoded).
_HIGHLIGHT_CONTENT_RE = re.compile(
    r'"abbreviatedContent":\s*"((?:[^"\\]|\\[\s\S])*\\?)'
)


def _parse_full_contexts(contexts_raw):
    """Decode the JSON text of a ``contexts`` cell into numbered chunks.

    Returns a list of ``{'chunk_num', 'content'}`` dicts, one per list entry
    that is a dict with a ``content`` key; ``chunk_num`` is the entry's
    1-based position in the decoded list. Non-string input, or JSON that is
    not a list, yields an empty list. Invalid JSON raises (json.loads error).
    """
    if not isinstance(contexts_raw, str):
        return []

    decoded = json.loads(contexts_raw)
    if not isinstance(decoded, list):
        return []

    return [
        {'chunk_num': position, 'content': chunk['content']}
        for position, chunk in enumerate(decoded, start=1)
        if isinstance(chunk, dict) and 'content' in chunk
    ]


def _extract_highlight_items(raw_highlighted):
    """Turn a ``contexts_highlighted`` value into a list of item dicts.

    A list/tuple is used as-is. A non-empty string is scanned with the two
    module-level patterns and the type/content matches are paired in order.
    Anything else (missing column, NaN from an empty CSV cell, empty string)
    yields no items.
    """
    if isinstance(raw_highlighted, (list, tuple)):
        return list(raw_highlighted)
    if not isinstance(raw_highlighted, str) or not raw_highlighted:
        return []

    found_types = _HIGHLIGHT_TYPE_RE.findall(raw_highlighted)
    found_contents = _HIGHLIGHT_CONTENT_RE.findall(raw_highlighted)
    # zip stops at the shorter list, so an unmatched trailing type or content
    # is dropped rather than paired with nothing.
    return [
        {'type': ctx_type, 'abbreviatedContent': content}
        for ctx_type, content in zip(found_types, found_contents)
    ]


def _build_highlighted_contexts(raw_highlighted):
    """Render highlighted context items into display dicts.

    Each dict item's ``abbreviatedContent`` is passed through
    process_highlights() and emitted as ``{'chunk_num', 'content',
    'is_primary'}``; ``chunk_num`` is the item's 1-based position and
    ``is_primary`` is True only for type ``'primary'``. Non-dict items are
    skipped. If anything raises part-way through, the error is printed and
    the items rendered so far are returned.
    """
    rendered = []
    try:
        items = _extract_highlight_items(raw_highlighted)
        for position, item in enumerate(items, start=1):
            if not isinstance(item, dict):
                continue
            rendered.append({
                'chunk_num': position,
                'content': process_highlights(item.get('abbreviatedContent', '')),
                'is_primary': item.get('type', 'secondary') == 'primary',
            })
    except Exception as e:
        # Best-effort: the caller falls back to the full contexts when
        # nothing could be rendered.
        print(f"Error processing highlighted contexts: {e}")
    return rendered


def get_random_example():
    """
    Select a random example from the loaded arena data.

    Returns:
        A dict with the keys ``question``, ``processed_context_desc``,
        ``Answerable``, ``insufficient``, ``insufficient_reason``,
        ``full_contexts`` (list of ``{'chunk_num', 'content'}`` dicts parsed
        from the row's JSON ``contexts`` cell) and ``contexts`` (list of
        ``{'chunk_num', 'content', 'is_primary'}`` dicts for display).
        ``contexts`` is built from ``contexts_highlighted`` when that yields
        anything, otherwise from ``full_contexts`` with ``is_primary`` False.
        When no data is loaded, the result of create_dummy_example() is
        returned instead.

    Raises:
        KeyError: If the selected row has no ``question`` column.
    """
    # Uses the cached data when available - no reload from disk
    df = load_arena_data()

    if df.empty:
        return create_dummy_example()

    example = df.sample(1).iloc[0]

    processed_example = {
        "question": example['question'],
        "processed_context_desc": example.get('processed_context_desc', ''),
        "Answerable": example.get('Answerable', True),  # Default to True unless specified otherwise
        "insufficient": example.get('insufficient', False),
        "insufficient_reason": example.get('insufficient_reason', ''),
    }

    # Full contexts: a missing column or malformed JSON degrades to an empty list
    try:
        processed_example["full_contexts"] = _parse_full_contexts(example['contexts'])
    except Exception as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []

    # Highlighted contexts for display
    contexts_highlighted = _build_highlighted_contexts(
        example.get('contexts_highlighted')
    )

    # If nothing could be rendered, fall back to the full contexts
    if not contexts_highlighted and processed_example["full_contexts"]:
        contexts_highlighted = [
            {
                'chunk_num': position,
                'content': ctx.get('content', ''),
                'is_primary': False,
            }
            for position, ctx in enumerate(processed_example["full_contexts"], start=1)
        ]

    processed_example["contexts"] = contexts_highlighted

    return processed_example

def get_random_example_and_models(model_names):
    """
    Select a random arena example and two models for positions A and B.

    Args:
        model_names: Iterable of model names to draw from. It is copied into
            a list before sampling, so sets and dict key views are accepted
            as well as sequences. It must contain at least two entries.

    Returns:
        A tuple ``(example, model_a_name, model_b_name)``. ``example`` is the
        value returned by get_random_example(); the two names are drawn from
        ``model_names`` without replacement.

    Raises:
        ValueError: If fewer than two model names are supplied.
    """
    # random.sample needs a sequence; materialise whatever iterable was given
    candidates = list(model_names)
    if len(candidates) < 2:
        raise ValueError(
            f"Need at least two model names to pick from, got {len(candidates)}"
        )

    example = get_random_example()
    # Two different positions of the candidate list
    model_a_name, model_b_name = random.sample(candidates, 2)
    return example, model_a_name, model_b_name