Spaces:

RyanS974
/

525GradioApp

Sleeping

525GradioApp / utils /text_dataset_parser.py

Ryan

update

360146a 4 months ago

2.29 kB

	"""
	Utility functions for parsing text-based dataset files for LLM response comparator.
	"""
	import re
	from pathlib import Path

	def parse_text_file(file_path):
	"""
	Parse a text file to extract prompt, response1, model1, response2, and model2.

	Format:
	- \prompt= followed by the prompt text
	- \response1= followed by the first model's response
	- \model1= followed by the first model's name
	- \response2= followed by the second model's response
	- \model2= followed by the second model's name

	Args:
	file_path (str): Path to the text file.

	Returns:
	dict: Dictionary with prompt, response1, model1, response2, and model2.
	"""
	with open(file_path, 'r', encoding='utf-8') as file:
	content = file.read()

	# Extract sections using regular expressions
	prompt = re.search(r'\\prompt=(.*?)(?=\\response1=\|$)', content, re.DOTALL)
	response1 = re.search(r'\\response1=(.*?)(?=\\model1=\|$)', content, re.DOTALL)
	model1 = re.search(r'\\model1=(.*?)(?=\\response2=\|$)', content, re.DOTALL)
	response2 = re.search(r'\\response2=(.*?)(?=\\model2=\|$)', content, re.DOTALL)
	model2 = re.search(r'\\model2=(.*?)(?=$)', content, re.DOTALL)

	return {
	"prompt": prompt.group(1).strip() if prompt else "",
	"response1": response1.group(1).strip() if response1 else "",
	"model1": model1.group(1).strip() if model1 else "",
	"response2": response2.group(1).strip() if response2 else "",
	"model2": model2.group(1).strip() if model2 else ""
	}

	def load_text_file(file_path):
	"""
	Load a single text file as a dataset entry.

	Args:
	file_path (str): Path to the text file.

	Returns:
	dict: Dataset entry with prompt, response1, model1, response2, and model2.
	"""
	return parse_text_file(file_path)

	def load_builtin_datasets(directory_path):
	"""
	Load all built-in datasets from a directory.

	Args:
	directory_path (str): Path to the directory containing text files.

	Returns:
	list: List of dataset entries.
	"""
	path = Path(directory_path)
	text_files = list(path.glob('*.txt'))
	return [parse_text_file(str(file_path)) for file_path in text_files]