Spaces:

JoeArmani
/

csc525_retrieval_based_chatbot

Sleeping

App Files Files Community

JoeArmani committed on Jan 11

Commit

c111c20

1 Parent(s): cc2577d

style refinements

Browse files

Files changed (3) hide show

new_iteration/pipeline_config.py +0 -9
new_iteration/run_taskmaster_processor.py +9 -13
new_iteration/taskmaster_processor.py +36 -43

new_iteration/pipeline_config.py DELETED Viewed

@@ -1,9 +0,0 @@
-from dataclasses import dataclass
-@dataclass
-class PipelineConfig:
-    """Minimal pipeline config."""
-    max_length: int = 512        # max length if you want to skip long utterances
-    min_turns: int = 4           # minimum total turns (user + assistant)
-    min_user_words: int = 3      # min words in each user turn
-    debug: bool = True           # enable debug prints

new_iteration/run_taskmaster_processor.py CHANGED Viewed

@@ -1,30 +1,26 @@
 import json
 from datetime import datetime
 from pathlib import Path
-from data_augmentation.pipeline_config import PipelineConfig
-from data_augmentation.taskmaster_processor import TaskmasterProcessor
 def main():
-    # 1) Setup config
-    config = PipelineConfig(
         max_length=512,
         min_turns=4,
-        min_user_words=3,
-        debug=True
     )
-    # 2) Instantiate processor
-    base_dir = "datasets/taskmaster"
     processor = TaskmasterProcessor(config)
-    # 3) Load raw dialogues
     dialogues = processor.load_taskmaster_dataset(base_dir=base_dir, max_examples=None)
-    # 4) Filter & convert to final structure
     final_dialogues = processor.filter_and_convert(dialogues)
-    # 5) Save final data
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     output_dir = Path("processed_outputs")
     output_dir.mkdir(parents=True, exist_ok=True)

 import json
 from datetime import datetime
 from pathlib import Path
+from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig
 def main():
+    # Setup config and processor
+    base_dir = "datasets/taskmaster"
+    config = RawDataProcessingConfig(
+        debug=True,
         max_length=512,
         min_turns=4,
+        min_user_words=3
     )
     processor = TaskmasterProcessor(config)
+    # Load dialogues
     dialogues = processor.load_taskmaster_dataset(base_dir=base_dir, max_examples=None)
+    # Filter and convert dialogues
     final_dialogues = processor.filter_and_convert(dialogues)
+    # Save processed dialogues
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     output_dir = Path("processed_outputs")
     output_dir.mkdir(parents=True, exist_ok=True)

new_iteration/taskmaster_processor.py CHANGED Viewed

@@ -19,27 +19,29 @@ class TaskmasterDialogue:
     def validate(self) -> bool:
         return bool(self.conversation_id and isinstance(self.turns, list))
-class PipelineConfig:
     """
-    Example config structure. Adjust to your real config usage.
     """
     def __init__(
         self,
         debug: bool = True,
         min_turns: int = 2,
         min_user_words: int = 3
     ):
         self.debug = debug
         self.min_turns = min_turns
         self.min_user_words = min_user_words
 class TaskmasterProcessor:
     """
-    Loads Taskmaster-1 dialogues, extracts domain from scenario,
-    cleans + filters them, and outputs a pipeline-friendly format.
     """
-    def __init__(self, config: PipelineConfig):
         self.config = config
     def load_taskmaster_dataset(
@@ -48,20 +50,20 @@ class TaskmasterProcessor:
         max_examples: Optional[int] = None
     ) -> List[TaskmasterDialogue]:
         """
-        Load and parse Taskmaster JSON for self-dialogs & woz-dialogs (Taskmaster-1).
-        Combines scenario text + conversation utterances to detect domain more robustly.
         """
         required_files = {
             "self-dialogs": "self-dialogs.json",
             "woz-dialogs": "woz-dialogs.json",
-            "ontology": "ontology.json",  # we might not actively use it, but let's expect it
         }
-        # 1) Check for missing
         missing = [k for k, v in required_files.items() if not Path(base_dir, v).exists()]
         if missing:
             raise FileNotFoundError(f"Missing Taskmaster files: {missing}")
-        # 2) Optionally load ontology
         ontology_path = Path(base_dir, required_files["ontology"])
         with open(ontology_path, 'r', encoding='utf-8') as f:
             ontology = json.load(f)
@@ -70,6 +72,7 @@ class TaskmasterProcessor:
         dialogues: List[TaskmasterDialogue] = []
         file_keys = ["self-dialogs", "woz-dialogs"]
         for file_key in file_keys:
             file_path = Path(base_dir, required_files[file_key])
@@ -81,14 +84,14 @@ class TaskmasterProcessor:
                 instruction_id = d.get("instruction_id", None)
                 scenario_text = d.get("scenario", "")
-                # 3) Convert raw utterances
                 utterances = d.get("utterances", [])
                 turns = self._process_utterances(utterances)
-                # 4) Domain detection
                 domain = self._extract_domain(scenario_text, turns)
-                # 5) Build the structured object
                 new_dlg = TaskmasterDialogue(
                     conversation_id=conversation_id,
                     instruction_id=instruction_id,
@@ -115,7 +118,7 @@ class TaskmasterProcessor:
             txt = turn.get('text', '').lower()
             combined_text += " " + txt
-        # Expanded domain patterns
         domain_patterns = {
             'restaurant': r'\b(restaurant|dining|food|reservation|table|menu|cuisine|eat|hungry)\b',
             'movie': r'\b(movie|cinema|film|ticket|showtime|theater|flick|screening)\b',
@@ -125,12 +128,12 @@ class TaskmasterProcessor:
             'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change)\b'
         }
-        for dom, pattern in domain_patterns.items():
             if re.search(pattern, combined_text):
                 # Optional: print if debug
                 if self.config.debug:
-                    print(f"Matched domain: {dom} in scenario/turns")
-                return dom
         if self.config.debug:
             print("No domain match, returning 'other'")
@@ -138,30 +141,26 @@ class TaskmasterProcessor:
     def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
         """
-        Convert raw utterances to a cleaned list of (speaker, text).
-        Skip or remove lines that are numeric, too short, or empty.
         """
         cleaned_turns = []
         for utt in utterances:
             speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
             raw_text = utt.get('text', '').strip()
-            # 1) Optional text cleaning
             text = self._clean_text(raw_text)
-            # 2) Skip blank or numeric lines
-            if not text:
-                continue
-            if self._is_numeric_line(text):
                 continue
-            # 3) If it's extremely short, skip.
-            # (For example, "ok" or "yes" might be 1-2 words.)
-            if len(text.split()) < 2:
-                # Optionally keep "ok" or "yes" if you'd like, but let's skip them to keep quality up
                 continue
-            # 4) Append
             cleaned_turns.append({
                 'speaker': speaker,
                 'text': text
@@ -170,29 +169,24 @@ class TaskmasterProcessor:
     def _clean_text(self, text: str) -> str:
         """
-        Basic text normalization: remove repeated punctuation, handle weird spacing, etc.
-        Adjust to your needs.
         """
-        # Example: collapse multiple spaces
         text = re.sub(r'\s+', ' ', text)
-        # Example: remove trailing punctuation or repeated punctuation
-        # e.g. "Sure!!!" => "Sure!"
         text = re.sub(r'([!?.,])\1+', r'\1', text)
         return text.strip()
     def _is_numeric_line(self, text: str) -> bool:
         """
         Return True if line is purely digits/punctuation/spaces,
-        e.g. "4 3 13", "12345", "3.14". Adjust as needed.
         """
         pattern = r'^[\s]*[\d]+([\s\d.,]+)*[\s]*$'
         return bool(re.match(pattern, text))
     def filter_and_convert(self, dialogues: List[TaskmasterDialogue]) -> List[Dict]:
         """
-        Filter out dialogues that don't meet min turns / min user words,
-        then convert them to final pipeline format:
             {
               "dialogue_id": "...",
               "domain": "...",
@@ -204,12 +198,11 @@ class TaskmasterProcessor:
             if not dlg.validate():
                 continue
-            # If after cleaning, we have too few turns, skip
             if len(dlg.turns) < self.config.min_turns:
                 continue
-            # Check user-turn min words
-            # E.g. user must have >= 3 words
             keep = True
             for turn in dlg.turns:
                 if turn['speaker'] == 'user':
@@ -230,4 +223,4 @@ class TaskmasterProcessor:
         if self.config.debug:
             print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues after cleaning.")
-        return results

     def validate(self) -> bool:
         return bool(self.conversation_id and isinstance(self.turns, list))
+class RawDataProcessingConfig:
     """
+    Simple config for raw dataset processing
     """
     def __init__(
         self,
         debug: bool = True,
+        max_length: int = 512,
         min_turns: int = 2,
         min_user_words: int = 3
     ):
         self.debug = debug
+        self.max_length = max_length
         self.min_turns = min_turns
         self.min_user_words = min_user_words
 class TaskmasterProcessor:
     """
+    Load Taskmaster-1 dialogues and extract the domain of each.
+    Clean and filter them, then convert to the pipeline format.
     """
+    def __init__(self, config: RawDataProcessingConfig):
         self.config = config
     def load_taskmaster_dataset(
         max_examples: Optional[int] = None
     ) -> List[TaskmasterDialogue]:
         """
+        Load & parse Taskmaster-1 JSON for self-dialogs & woz-dialogs.
         """
         required_files = {
             "self-dialogs": "self-dialogs.json",
             "woz-dialogs": "woz-dialogs.json",
+            "ontology": "ontology.json",
         }
+        # Check for missing files
         missing = [k for k, v in required_files.items() if not Path(base_dir, v).exists()]
         if missing:
             raise FileNotFoundError(f"Missing Taskmaster files: {missing}")
+        # Load ontology
         ontology_path = Path(base_dir, required_files["ontology"])
         with open(ontology_path, 'r', encoding='utf-8') as f:
             ontology = json.load(f)
         dialogues: List[TaskmasterDialogue] = []
+        # Process each file
         file_keys = ["self-dialogs", "woz-dialogs"]
         for file_key in file_keys:
             file_path = Path(base_dir, required_files[file_key])
                 instruction_id = d.get("instruction_id", None)
                 scenario_text = d.get("scenario", "")
+                # Handle utterances
                 utterances = d.get("utterances", [])
                 turns = self._process_utterances(utterances)
+                # Detect Domain
                 domain = self._extract_domain(scenario_text, turns)
+                # Build the object
                 new_dlg = TaskmasterDialogue(
                     conversation_id=conversation_id,
                     instruction_id=instruction_id,
             txt = turn.get('text', '').lower()
             combined_text += " " + txt
+        # Domain patterns
         domain_patterns = {
             'restaurant': r'\b(restaurant|dining|food|reservation|table|menu|cuisine|eat|hungry)\b',
             'movie': r'\b(movie|cinema|film|ticket|showtime|theater|flick|screening)\b',
             'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change)\b'
         }
+        for domain, pattern in domain_patterns.items():
             if re.search(pattern, combined_text):
                 # Optional: print if debug
                 if self.config.debug:
+                    print(f"Matched domain: {domain} in scenario/turns")
+                return domain
         if self.config.debug:
             print("No domain match, returning 'other'")
     def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
         """
+        Convert raw utterances to a cleaned list of {'speaker', 'text'} dicts.
+        Skip lines that are numeric, too short, or empty.
         """
         cleaned_turns = []
         for utt in utterances:
             speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
             raw_text = utt.get('text', '').strip()
+            # Text cleaning
             text = self._clean_text(raw_text)
+            # Skip blank or numeric lines (e.g. "4 3 13")
+            if not text or self._is_numeric_line(text):
                 continue
+            # Skip turns with fewer than 3 words from either speaker (e.g. "ok", "yes"); they add no training benefit.
+            if len(text.split()) < 3:
                 continue
+            # Add to cleaned turns
             cleaned_turns.append({
                 'speaker': speaker,
                 'text': text
     def _clean_text(self, text: str) -> str:
         """
+        Simple text normalization
         """
+        # Collapse whitespace runs to one space and repeated punctuation (e.g. "!!!") to a single mark
         text = re.sub(r'\s+', ' ', text)
         text = re.sub(r'([!?.,])\1+', r'\1', text)
         return text.strip()
     def _is_numeric_line(self, text: str) -> bool:
         """
         Return True if line is purely digits/punctuation/spaces,
+        e.g. "4 3 13" and similar lines found in the Taskmaster-1 dataset.
         """
         pattern = r'^[\s]*[\d]+([\s\d.,]+)*[\s]*$'
         return bool(re.match(pattern, text))
     def filter_and_convert(self, dialogues: List[TaskmasterDialogue]) -> List[Dict]:
         """
+        Filter out dialogues that don't meet min length requirements. Convert to pipeline format.
             {
               "dialogue_id": "...",
               "domain": "...",
             if not dlg.validate():
                 continue
+            # Skip if too few turns
             if len(dlg.turns) < self.config.min_turns:
                 continue
+            # Skip if any user turn is too short
             keep = True
             for turn in dlg.turns:
                 if turn['speaker'] == 'user':
         if self.config.debug:
             print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues after cleaning.")
+        return results