# 525GradioApp / utils / text_dataset_parser.py
# Ryan
# update
# 360146a
"""
Utility functions for parsing text-based dataset files for LLM response comparator.
"""
import re
from pathlib import Path
def parse_text_file(file_path):
    r"""
    Parse a text file into its prompt, response, and model-name sections.

    The file is expected to contain these markers, in this order:
    - \prompt= followed by the prompt text
    - \response1= followed by the first model's response
    - \model1= followed by the first model's name
    - \response2= followed by the second model's response
    - \model2= followed by the second model's name

    Each section runs from its marker up to the first later marker in that
    order (or the end of the text), so a missing marker leaves only its own
    field empty instead of letting the preceding section absorb every section
    after it. Surrounding whitespace is stripped from every value.

    Args:
        file_path (str): Path to the text file (read as UTF-8).

    Returns:
        dict: Dictionary with the keys prompt, response1, model1, response2,
            and model2. A section whose marker is absent maps to "".
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    field_names = ("prompt", "response1", "model1", "response2", "model2")
    parsed = {}
    for position, name in enumerate(field_names):
        # Stop at ANY marker that comes later in the expected order, not only
        # the immediate successor, and fall back to the end of the text.
        terminators = [
            re.escape("\\" + later + "=")
            for later in field_names[position + 1:]
        ]
        terminators.append("$")
        pattern = (
            re.escape("\\" + name + "=")
            + "(.*?)(?="
            + "|".join(terminators)
            + ")"
        )
        # DOTALL lets a section span several lines.
        match = re.search(pattern, content, re.DOTALL)
        parsed[name] = match.group(1).strip() if match else ""
    return parsed
def load_text_file(file_path):
    """
    Read one text file and return it as a single dataset entry.

    This is a thin wrapper that hands the path to parse_text_file.

    Args:
        file_path (str): Path to the text file.

    Returns:
        dict: Dataset entry with prompt, response1, model1, response2, and model2.
    """
    dataset_entry = parse_text_file(file_path)
    return dataset_entry
def load_builtin_datasets(directory_path):
    """
    Load every built-in dataset stored as a .txt file in a directory.

    Only regular files directly inside the directory are read (the glob is
    not recursive). Entries are returned in sorted path order so the result
    does not depend on the order in which the file system lists the files.

    Args:
        directory_path (str): Path to the directory containing text files.

    Returns:
        list: List of dataset entries, one dict per text file as returned by
            parse_text_file. Empty when no .txt file is found.
    """
    directory = Path(directory_path)
    # A sub-directory whose name ends in .txt also matches the glob but cannot
    # be opened as a file, so keep regular files only.
    text_files = sorted(
        entry for entry in directory.glob('*.txt') if entry.is_file()
    )
    return [parse_text_file(str(text_file)) for text_file in text_files]