# Spaces: RyanS974 / 525GradioApp (Sleeping)
# File size: 2,288 Bytes
# 360146a

"""
Utility functions for parsing text-based dataset files for LLM response comparator.
"""
import re
from pathlib import Path

def parse_text_file(file_path):
    r"""
    Parse a text file to extract prompt, response1, model1, response2, and model2.

    Format (markers are expected in this order):
    - \prompt= followed by the prompt text
    - \response1= followed by the first model's response
    - \model1= followed by the first model's name
    - \response2= followed by the second model's response
    - \model2= followed by the second model's name

    Each section starts at the first occurrence of its marker and runs up to
    the marker listed after it, or to the end of the content when that marker
    is absent. The last section (\model2=) always runs to the end of the
    content. Section text may span multiple lines; leading and trailing
    whitespace is stripped.

    Args:
        file_path (str): Path to the text file. The file is read as UTF-8.

    Returns:
        dict: Dictionary with the keys "prompt", "response1", "model1",
            "response2", and "model2". A section whose marker is not found
            maps to an empty string.
    """
    # NOTE: the docstring above is a raw string on purpose. In a normal
    # string literal "\r" (as in "\response1=") is a carriage return and
    # "\p" / "\m" are invalid escape sequences.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Each section name is paired with the marker that ends it; None means
    # the section extends to the end of the content.
    sections = (
        ("prompt", "response1"),
        ("response1", "model1"),
        ("model1", "response2"),
        ("response2", "model2"),
        ("model2", None),
    )

    parsed = {}
    for name, terminator in sections:
        if terminator is None:
            stop = r'(?=$)'
        else:
            stop = rf'(?=\\{terminator}=|$)'
        # DOTALL lets the lazy capture cross newlines so that multi-line
        # prompts and responses are captured whole.
        match = re.search(rf'\\{name}=(.*?){stop}', content, re.DOTALL)
        parsed[name] = match.group(1).strip() if match else ""

    return parsed

def load_text_file(file_path):
    """
    Build one dataset entry from a single text file.

    This delegates entirely to parse_text_file; nothing is added to or
    removed from the parsed result.

    Args:
        file_path (str): Path to the text file.

    Returns:
        dict: Dataset entry with prompt, response1, model1, response2, and model2.
    """
    entry = parse_text_file(file_path)
    return entry

def load_builtin_datasets(directory_path):
    """
    Load all built-in datasets from a directory.

    Every entry directly inside the directory whose name matches ``*.txt`` is
    parsed with parse_text_file. Subdirectories are not searched.

    Args:
        directory_path (str): Path to the directory containing text files.

    Returns:
        list: List of dataset entries (one dict per text file), ordered by
            file path. Empty when no ``*.txt`` entries are found.
    """
    directory = Path(directory_path)
    # Path.glob yields matches in a filesystem-dependent order; sorting keeps
    # the dataset order stable across machines and runs.
    text_files = sorted(directory.glob('*.txt'))
    return [parse_text_file(str(text_file)) for text_file in text_files]