"""
Utility functions for pre-processing and post-processing in the chat application.
"""


def preprocess_chat_input(message, history, system_prompt=""):
    """
    Pre-process chat input to prepare it for the model.

    Args:
        message (str): The current user message
        history (list): List of (user_message, assistant_message) tuples
        system_prompt (str): Optional system prompt to guide the model's behavior

    Returns:
        list: Messages in the role/content format expected by the tokenizer
    """
    # Flatten the (user, assistant) history pairs into role/content dicts.
    formatted_history = []
    for user_msg, assistant_msg in history:
        formatted_history.append({"role": "user", "content": user_msg})
        formatted_history.append({"role": "assistant", "content": assistant_msg})

    # Append the current user turn.
    formatted_history.append({"role": "user", "content": message})

    # Prepend the system prompt, if one was provided.
    if system_prompt.strip():
        messages = [{"role": "system", "content": system_prompt.strip()}] + formatted_history
    else:
        messages = formatted_history

    return messages
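
# A quick illustration of the structure preprocess_chat_input returns
# (the values are hypothetical):
#
#   preprocess_chat_input("How are you?", [("Hi", "Hello!")], "Be concise")
#   -> [{"role": "system", "content": "Be concise"},
#       {"role": "user", "content": "Hi"},
#       {"role": "assistant", "content": "Hello!"},
#       {"role": "user", "content": "How are you?"}]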


def format_prompt(message, history, tokenizer, system_prompt=""):
    """
    Format message and history into a prompt for Qwen models.

    Uses tokenizer.apply_chat_template if available, otherwise falls back
    to manual formatting.

    Args:
        message (str): The current user message
        history (list): List of (user_message, assistant_message) tuples
        tokenizer: The model tokenizer
        system_prompt (str): Optional system prompt to guide the model's behavior

    Returns:
        str: Formatted prompt ready for the model
    """
    messages = preprocess_chat_input(message, history, system_prompt)

    # Prefer the tokenizer's built-in chat template when one is defined.
    # enable_thinking is honored by Qwen3-style templates and ignored by
    # templates that do not reference it.
    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True,
        )
    else:
        return format_prompt_fallback(messages)
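
# A minimal usage sketch, assuming the Hugging Face `transformers` library;
# the model id is illustrative, any chat-template model works the same way:
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
#   prompt = format_prompt("How are you?", [("Hi", "Hello!")], tokenizer,
#                          system_prompt="Be concise")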


def format_prompt_fallback(messages):
    """
    Fallback prompt formatting for models without chat templates.

    Args:
        messages (list): List of message dictionaries with role and content

    Returns:
        str: Formatted prompt string
    """
    prompt = ""

    # Emit the system prompt first, on its own line, if present.
    if messages and messages[0]["role"] == "system":
        prompt = messages[0]["content"].strip() + "\n"
        messages = messages[1:]

    # Render each turn with a simple role marker.
    for msg in messages:
        if msg["role"] == "user":
            prompt += f"<|User|>: {msg['content'].strip()}\n"
        elif msg["role"] == "assistant":
            prompt += f"<|Assistant|>: {msg['content'].strip()}\n"

    # End with an open assistant marker so the model continues from there.
    if not prompt.strip().endswith("<|Assistant|>:"):
        prompt += "<|Assistant|>:"

    return prompt
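
# With the example messages shown after preprocess_chat_input above, the
# fallback yields the following prompt:
#
#   Be concise
#   <|User|>: Hi
#   <|Assistant|>: Hello!
#   <|User|>: How are you?
#   <|Assistant|>: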


def prepare_generation_inputs(prompt, tokenizer, device):
    """
    Prepare tokenized inputs for model generation.

    Args:
        prompt (str): The formatted prompt
        tokenizer: The model tokenizer
        device: The device to place tensors on

    Returns:
        dict: Tokenized inputs ready for model generation
    """
    # Tokenize the prompt into PyTorch tensors.
    tokenized_inputs = tokenizer(prompt, return_tensors="pt")

    # Move every tensor (input_ids, attention_mask, ...) to the target device.
    inputs_on_device = {k: v.to(device) for k, v in tokenized_inputs.items()}

    return inputs_on_device
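
# Usage sketch, assuming a causal LM `model` already loaded (e.g. via
# AutoModelForCausalLM); the generation parameters are illustrative:
#
#   inputs = prepare_generation_inputs(prompt, tokenizer, model.device)
#   output_ids = model.generate(**inputs, max_new_tokens=512)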


def postprocess_response(response):
    """
    Post-process the model's response.

    Args:
        response (str): The raw model response

    Returns:
        str: The processed response
    """
    # Currently a pass-through; this is the single hook where future cleanup
    # (e.g. stripping stray special tokens or whitespace) belongs.
    return response
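
# End-to-end sketch tying the helpers together. The names `model`,
# `tokenizer`, `message`, and `history` are assumed to come from the
# surrounding application; the decode step mirrors standard transformers
# usage, slicing off the prompt tokens before decoding:
#
#   prompt = format_prompt(message, history, tokenizer, system_prompt)
#   inputs = prepare_generation_inputs(prompt, tokenizer, model.device)
#   output_ids = model.generate(**inputs, max_new_tokens=512)
#   new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
#   reply = postprocess_response(
#       tokenizer.decode(new_tokens, skip_special_tokens=True))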