Spaces:

JoeArmani
/

csc525_retrieval_based_chatbot

Sleeping

csc525_retrieval_based_chatbot / new_iteration /taskmaster_processor.py

JoeArmani

reranker scoring

e5be70f 5 months ago

8.74 kB

	import os
	import re
	import json
	from pathlib import Path
	from typing import List, Dict, Optional, Any
	from dataclasses import dataclass, field

	@dataclass
	class TaskmasterDialogue:
	    """
	    One Taskmaster conversation in structured form.

	    Fields:
	        conversation_id: Identifier of the conversation; must be non-empty
	            for the dialogue to pass `validate()`.
	        instruction_id: Optional instruction identifier from the raw record.
	        scenario: Optional scenario text from the raw record.
	        domain: Optional domain label assigned to the conversation.
	        turns: List of turn dicts (the rest of this file stores
	            'speaker' and 'text' keys in each).
	        original_metadata: Free-form extra data; each instance gets its own
	            empty dict by default.
	    """
	    conversation_id: str
	    instruction_id: Optional[str]
	    scenario: Optional[str]
	    domain: Optional[str]
	    turns: List[Dict[str, Any]]
	    # default_factory avoids sharing one mutable dict across instances.
	    original_metadata: Dict[str, Any] = field(default_factory=dict)

	    def __str__(self):
	        """Return a short summary: conversation id and number of turns."""
	        return f"TaskmasterDialogue(conversation_id={self.conversation_id}, turns={len(self.turns)} turns)"

	    def validate(self) -> bool:
	        """
	        Return True when the dialogue has a non-empty conversation_id and
	        `turns` is a list (an empty list is still considered valid).
	        """
	        return bool(self.conversation_id and isinstance(self.turns, list))

	class PipelineConfig:
	    """
	    Example config structure. Adjust to your real config usage.

	    Attributes:
	        debug: When True, the processor prints progress/diagnostic messages.
	        min_turns: Minimum number of cleaned turns a dialogue must have to
	            be kept by the filtering step.
	        min_user_words: Minimum number of words required in every user turn
	            for a dialogue to be kept by the filtering step.
	    """

	    def __init__(
	        self,
	        debug: bool = True,
	        min_turns: int = 2,
	        min_user_words: int = 3
	    ):
	        self.debug = debug
	        self.min_turns = min_turns
	        self.min_user_words = min_user_words

	class TaskmasterProcessor:
	    """
	    Loads Taskmaster-1 dialogues, extracts domain from scenario,
	    cleans + filters them, and outputs a pipeline-friendly format.
	    """

	    # Keyword patterns per domain, compiled once. They are tried in insertion
	    # order and the first match wins, so earlier domains shadow later ones
	    # (e.g. a text mentioning both "food" and "pizza" is labelled 'restaurant').
	    _DOMAIN_PATTERNS = {
	        'restaurant': re.compile(r'\b(restaurant|dining|food|reservation|table|menu|cuisine|eat|hungry)\b'),
	        'movie': re.compile(r'\b(movie|cinema|film|ticket|showtime|theater|flick|screening)\b'),
	        'ride_share': re.compile(r'\b(ride|taxi|uber|lyft|car\s?service|pickup|dropoff|driver)\b'),
	        'coffee': re.compile(r'\b(coffee|café|cafe|starbucks|espresso|latte|mocha|americano)\b'),
	        'pizza': re.compile(r'\b(pizza|delivery|order\s?food|pepperoni|topping|pizzeria|slice)\b'),
	        'auto': re.compile(r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change)\b'),
	    }

	    # A line that starts with a digit and otherwise contains only digits,
	    # whitespace, '.' and ','.
	    _NUMERIC_LINE = re.compile(r'\s*\d[\s\d.,]*')

	    def __init__(self, config: "PipelineConfig"):
	        """Store the pipeline config (reads .debug, .min_turns, .min_user_words)."""
	        self.config = config

	    def load_taskmaster_dataset(
	        self,
	        base_dir: str,
	        max_examples: Optional[int] = None
	    ) -> List["TaskmasterDialogue"]:
	        """
	        Load and parse Taskmaster JSON for self-dialogs & woz-dialogs (Taskmaster-1).
	        Combines scenario text + conversation utterances to detect domain more robustly.

	        Args:
	            base_dir: Directory containing self-dialogs.json, woz-dialogs.json
	                and ontology.json.
	            max_examples: Upper bound on the total number of dialogues returned
	                across both files. None or 0 means no limit.

	        Returns:
	            List of TaskmasterDialogue objects, self-dialogs first.

	        Raises:
	            FileNotFoundError: If any of the three expected files is missing.
	        """
	        required_files = {
	            "self-dialogs": "self-dialogs.json",
	            "woz-dialogs": "woz-dialogs.json",
	            "ontology": "ontology.json",  # we might not actively use it, but let's expect it
	        }
	        # 1) Check for missing
	        missing = [k for k, v in required_files.items() if not Path(base_dir, v).exists()]
	        if missing:
	            raise FileNotFoundError(f"Missing Taskmaster files: {missing}")

	        # 2) Optionally load ontology (only used for the debug message)
	        ontology_path = Path(base_dir, required_files["ontology"])
	        with open(ontology_path, 'r', encoding='utf-8') as f:
	            ontology = json.load(f)
	        if self.config.debug:
	            print(f"[TaskmasterProcessor] Loaded ontology with {len(ontology)} top-level keys (unused).")

	        dialogues: List["TaskmasterDialogue"] = []

	        def limit_reached() -> bool:
	            # A falsy max_examples (None or 0) means "no limit".
	            return bool(max_examples) and len(dialogues) >= max_examples

	        for file_key in ("self-dialogs", "woz-dialogs"):
	            # Stop before opening the next file; breaking only the inner
	            # loop would let the next file add one extra dialogue.
	            if limit_reached():
	                break

	            file_path = Path(base_dir, required_files[file_key])
	            with open(file_path, 'r', encoding='utf-8') as f:
	                raw_data = json.load(f)

	            for d in raw_data:
	                conversation_id = d.get("conversation_id", "")
	                instruction_id = d.get("instruction_id", None)
	                scenario_text = d.get("scenario", "")

	                # 3) Convert raw utterances (a JSON null is treated as "no utterances")
	                utterances = d.get("utterances") or []
	                turns = self._process_utterances(utterances)

	                # 4) Domain detection
	                domain = self._extract_domain(scenario_text, turns)

	                # 5) Build the structured object
	                new_dlg = TaskmasterDialogue(
	                    conversation_id=conversation_id,
	                    instruction_id=instruction_id,
	                    scenario=scenario_text,
	                    domain=domain,
	                    turns=turns,
	                    original_metadata={}
	                )
	                dialogues.append(new_dlg)

	                if limit_reached():
	                    break

	        if self.config.debug:
	            print(f"[TaskmasterProcessor] Loaded {len(dialogues)} total dialogues from Taskmaster-1.")
	        return dialogues

	    def _extract_domain(self, scenario: Optional[str], turns: List[Dict[str, str]]) -> str:
	        """
	        Combine scenario text + all turn texts to detect domain more robustly.

	        Returns the first domain in _DOMAIN_PATTERNS whose keyword pattern
	        matches the lower-cased combined text, or 'other' if none match.
	        A None scenario or turn text is treated as an empty string.
	        """
	        parts = [scenario or ""]
	        parts.extend((turn.get('text') or '') for turn in turns)
	        combined_text = " ".join(parts).lower()

	        for dom, pattern in self._DOMAIN_PATTERNS.items():
	            if pattern.search(combined_text):
	                # Optional: print if debug
	                if self.config.debug:
	                    print(f"Matched domain: {dom} in scenario/turns")
	                return dom

	        if self.config.debug:
	            print("No domain match, returning 'other'")
	        return 'other'

	    def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
	        """
	        Convert raw utterances to a cleaned list of {'speaker', 'text'} dicts.
	        Skip lines that are empty, numeric-only, or shorter than two words.

	        The speaker is 'assistant' when the raw value is exactly 'ASSISTANT',
	        and 'user' for anything else.
	        """
	        cleaned_turns = []
	        for utt in utterances:
	            speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
	            # `or ''` covers both a missing key and an explicit null text.
	            raw_text = (utt.get('text') or '').strip()

	            # 1) Optional text cleaning
	            text = self._clean_text(raw_text)

	            # 2) Skip blank or numeric lines
	            if not text or self._is_numeric_line(text):
	                continue

	            # 3) If it's extremely short, skip.
	            # Single-word replies such as "ok" or "yes" are dropped to keep quality up.
	            if len(text.split()) < 2:
	                continue

	            # 4) Append
	            cleaned_turns.append({
	                'speaker': speaker,
	                'text': text
	            })
	        return cleaned_turns

	    def _clean_text(self, text: str) -> str:
	        """
	        Basic text normalization: collapse whitespace runs to one space and
	        collapse repeated punctuation (! ? . ,) to a single character,
	        e.g. "Sure!!!" => "Sure!". Adjust to your needs.
	        """
	        text = re.sub(r'\s+', ' ', text)
	        text = re.sub(r'([!?.,])\1+', r'\1', text)
	        return text.strip()

	    def _is_numeric_line(self, text: str) -> bool:
	        """
	        Return True if the line starts with a digit (after optional whitespace)
	        and contains only digits, whitespace, '.' and ',',
	        e.g. "4 3 13", "12345", "3.14". Adjust as needed.
	        """
	        return bool(self._NUMERIC_LINE.fullmatch(text))

	    def filter_and_convert(self, dialogues: List["TaskmasterDialogue"]) -> List[Dict]:
	        """
	        Filter out dialogues that don't meet min turns / min user words,
	        then convert them to final pipeline format:

	        {
	            "dialogue_id": "...",
	            "domain": "...",
	            "turns": [ {"speaker": "user", "text": "..."}, ... ]
	        }

	        A dialogue is dropped when it fails validate(), has fewer than
	        config.min_turns turns, or has ANY user turn with fewer than
	        config.min_user_words words.
	        """
	        results = []
	        for dlg in dialogues:
	            if not dlg.validate():
	                continue

	            # If after cleaning, we have too few turns, skip
	            if len(dlg.turns) < self.config.min_turns:
	                continue

	            # Every user turn must reach the minimum word count.
	            has_short_user_turn = any(
	                turn['speaker'] == 'user'
	                and len(turn['text'].split()) < self.config.min_user_words
	                for turn in dlg.turns
	            )
	            if has_short_user_turn:
	                continue

	            results.append({
	                'dialogue_id': dlg.conversation_id,
	                'domain': dlg.domain,
	                'turns': dlg.turns  # already cleaned
	            })

	        if self.config.debug:
	            print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues after cleaning.")
	        return results