SLM-RAG-Arena

Running on Zero

File size: 8,547 Bytes

8a142a6

import re
import json

def debug_text(text, label="Text"):
    """
    Print a short diagnostic report about a piece of text to stdout.

    The report is framed by a banner carrying `label` and shows the text
    length, its first 100 characters, and whether either highlight start
    marker spelling occurs in it. Nothing is returned.
    """
    preview = text[:100]
    has_highlight_start = '[[highlight_start]]' in text
    has_start_highlight = '[[start_highlight]]' in text

    report = (
        f"\n--- DEBUG {label} ---",
        f"Length: {len(text)}",
        f"First 100 chars: {preview}",
        f"Contains highlight_start: {has_highlight_start}",
        f"Contains start_highlight: {has_start_highlight}",
        "-------------------------\n",
    )
    for report_line in report:
        print(report_line)

def clean_json_text(text):
    """
    Remove leftover JSON-style escaping in front of quote characters.

    Handles text that came from JSON and still carries literal backslashes
    before quotes, e.g. "the sky isn\\'t falling" becomes
    "the sky isn't falling".

    The previous implementation round-tripped the text through
    json.dumps()/json.loads(), which always yields the input unchanged, so
    no unescaping ever happened. The quotes are now unescaped explicitly.

    Parameters:
    - text: The text to clean. Non-string values (e.g. None) are returned
      unchanged.

    Returns:
    - The text with every run of backslashes directly preceding a single or
      double quote removed. Other backslash sequences (such as a literal
      backslash followed by "n") are left untouched.
    """
    # Nothing to do for non-strings or for text without any backslash.
    if not isinstance(text, str) or '\\' not in text:
        return text

    # Drop one or more backslashes that immediately precede ' or ".
    return re.sub(r'''\\+(["'])''', r'\1', text)

def process_highlights(text):
    """
    Process highlight markers in text to create HTML highlighted text.

    Both marker spellings are supported:
    - [[highlight_start]] ... [[highlight_end]]
    - [[start_highlight]] ... [[end_highlight]]

    Each marked span is wrapped in <span class="highlight">...</span>.
    The text is passed through clean_json_text() first.

    Parameters:
    - text: The text that may contain highlight markers

    Returns:
    - The text with every complete marker pair replaced by a highlight span.
      Unpaired markers are left as they are.
    """
    # Clean JSON escaping
    text = clean_json_text(text)

    replacement = r'<span class="highlight">\1</span>'
    marker_patterns = (
        r'\[\[highlight_start\]\](.*?)\[\[highlight_end\]\]',
        r'\[\[start_highlight\]\](.*?)\[\[end_highlight\]\]',
    )

    highlighted_text = text
    for pattern in marker_patterns:
        # re.DOTALL lets "." match newlines, so a highlight that spans a line
        # break is still converted instead of leaving raw markers in the output.
        highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.DOTALL)

    return highlighted_text

def process_table_with_highlights(markdown_table):
    """
    Render a markdown table as HTML after expanding its highlight markers.

    The stripped table text is split on newlines, highlight markers are
    expanded one line at a time, and the rejoined text is handed to
    convert_markdown_table_to_html().
    """
    rows = markdown_table.strip().split('\n')
    highlighted_rows = [process_highlights(row) for row in rows]
    table_markdown = '\n'.join(highlighted_rows)
    return convert_markdown_table_to_html(table_markdown)

def _split_table_row(line):
    """Return the raw (unstripped) cell strings of one markdown table row."""
    # Everything before the first pipe is not a cell.
    cells = line.split('|')[1:]
    # A closing pipe produces one trailing non-cell fragment; drop it.
    if line.strip().endswith('|'):
        cells = cells[:-1]
    return cells


def _render_table_row(line, cell_tag):
    """Render one markdown table row as a <tr> of `cell_tag` cells (th or td)."""
    cells_html = ''.join(
        f'<{cell_tag}>{process_highlights(cell.strip())}</{cell_tag}>'
        for cell in _split_table_row(line)
    )
    return f'<tr>{cells_html}</tr>'


def convert_markdown_table_to_html(markdown_text):
    """
    Converts a markdown table to an HTML table.

    Only lines whose first non-blank character is "|" are treated as table
    rows; any other line in the input is ignored. If the second table line
    contains "---" it is taken as the header separator: the first line is
    rendered as a <thead> row and the separator itself is skipped. Otherwise
    every table line is rendered as a body row. Highlight markers inside
    each cell are expanded via process_highlights().

    Parameters:
    - markdown_text: The markdown text containing the table

    Returns:
    - An HTML string for a <table class="md-table">, or the (cleaned) input
      text unchanged when it has fewer than two table lines.
    """
    # Clean JSON escaping
    markdown_text = clean_json_text(markdown_text)

    table_lines = [
        line for line in markdown_text.strip().split('\n')
        if line.strip().startswith('|')
    ]

    if len(table_lines) < 2:  # Need at least header and separator
        return markdown_text  # Return original if not a proper table

    parts = ['<table class="md-table">']

    if '---' in table_lines[1]:
        # Header row followed by a separator row that is not rendered.
        parts.append(f'<thead>{_render_table_row(table_lines[0], "th")}</thead>')
        body_lines = table_lines[2:]
    else:
        # No header row, treat all rows as data
        body_lines = table_lines

    parts.append('<tbody>')
    parts.extend(_render_table_row(line, 'td') for line in body_lines)
    parts.append('</tbody>')
    parts.append('</table>')

    return ''.join(parts)

def _render_context_item(context_text, extra_class=""):
    """Wrap one context chunk in a context-item div, rendering markdown tables as HTML."""
    # A missing or None content value renders as an empty item instead of raising.
    context_text = context_text or ""

    # Check for markdown table format (both standard and newline format)
    looks_like_table = '|' in context_text and ('\n|' in context_text or '\n-' in context_text)
    if looks_like_table:
        inner_html = process_table_with_highlights(context_text)
    else:
        inner_html = process_highlights(context_text)

    return f'<div class="context-item{extra_class}">{inner_html}</div>'


def get_context_html(example, show_full=False):
    """
    Formats the context chunks into an HTML string for display using specific CSS classes.
    Includes an alert for insufficient context and applies highlighting.

    Parameters:
    - example: The example data containing contexts. Keys read here are
      "insufficient", "insufficient_reason", "full_contexts" and "contexts";
      each context item is read through its "content" and "is_primary" keys.
    - show_full: Boolean indicating whether to show full context

    Returns:
    - An HTML string: an optional insufficient-context alert followed by a
      context-items-container div holding one context-item div per chunk,
      or a single placeholder item when there is nothing to show.
    """
    parts = []

    # Add insufficient context warning if needed
    if example.get("insufficient", False):
        insufficient_reason = example.get("insufficient_reason", "")
        reason_html = f"<p>{insufficient_reason}</p>" if insufficient_reason else "<p>The context may not contain enough information to fully answer the question, or the question might be ambiguous. Models should ideally indicate this limitation or refuse to answer.</p>"

        parts.append(f"""
        <div class="insufficient-alert">
            <strong>
                <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 5px;">
                    <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
                    <line x1="12" y1="9" x2="12" y2="13"></line>
                    <line x1="12" y1="17" x2="12.01" y2="17"></line>
                </svg>
                Insufficient Context
            </strong>
            {reason_html}
        </div>
        """)

    # Create container div for all context items
    parts.append('<div class="context-items-container">')

    if show_full and example.get("full_contexts"):
        # Full view: one item per chunk, no primary-chunk styling
        parts.extend(
            _render_context_item(context_item.get('content', ''))
            for context_item in example["full_contexts"]
        )
    elif example.get("contexts"):
        # Highlighted view: primary chunks get an extra CSS class
        for context_item in example["contexts"]:
            extra_class = " primary-context" if context_item.get('is_primary', False) else ""
            parts.append(_render_context_item(context_item.get('content', ''), extra_class))
    else:
        # If no contexts available, show a message
        parts.append('<div class="context-item">No context available. Try toggling to full context view.</div>')

    # Close the container div
    parts.append('</div>')

    return ''.join(parts)