JoeArmani committed on
Commit
c111c20
·
1 Parent(s): cc2577d

style refinements

Browse files
new_iteration/pipeline_config.py DELETED
@@ -1,9 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- @dataclass
4
- class PipelineConfig:
5
- """Minimal pipeline config."""
6
- max_length: int = 512 # max length if you want to skip long utterances
7
- min_turns: int = 4 # minimum total turns (user + assistant)
8
- min_user_words: int = 3 # min words in each user turn
9
- debug: bool = True # enable debug prints
 
 
 
 
 
 
 
 
 
 
new_iteration/run_taskmaster_processor.py CHANGED
@@ -1,30 +1,26 @@
1
  import json
2
  from datetime import datetime
3
  from pathlib import Path
4
-
5
- from data_augmentation.pipeline_config import PipelineConfig
6
- from data_augmentation.taskmaster_processor import TaskmasterProcessor
7
 
8
  def main():
9
- # 1) Setup config
10
- config = PipelineConfig(
 
 
11
  max_length=512,
12
  min_turns=4,
13
- min_user_words=3,
14
- debug=True
15
  )
16
-
17
- # 2) Instantiate processor
18
- base_dir = "datasets/taskmaster"
19
  processor = TaskmasterProcessor(config)
20
 
21
- # 3) Load raw dialogues
22
  dialogues = processor.load_taskmaster_dataset(base_dir=base_dir, max_examples=None)
23
 
24
- # 4) Filter & convert to final structure
25
  final_dialogues = processor.filter_and_convert(dialogues)
26
 
27
- # 5) Save final data
28
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
29
  output_dir = Path("processed_outputs")
30
  output_dir.mkdir(parents=True, exist_ok=True)
 
1
  import json
2
  from datetime import datetime
3
  from pathlib import Path
4
+ from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig
 
 
5
 
6
  def main():
7
+ # Setup config and processor
8
+ base_dir = "datasets/taskmaster"
9
+ config = RawDataProcessingConfig(
10
+ debug=True,
11
  max_length=512,
12
  min_turns=4,
13
+ min_user_words=3
 
14
  )
 
 
 
15
  processor = TaskmasterProcessor(config)
16
 
17
+ # Load dialogues
18
  dialogues = processor.load_taskmaster_dataset(base_dir=base_dir, max_examples=None)
19
 
20
+ # Filter and convert dialogues
21
  final_dialogues = processor.filter_and_convert(dialogues)
22
 
23
+ # Save processed dialogues
24
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
25
  output_dir = Path("processed_outputs")
26
  output_dir.mkdir(parents=True, exist_ok=True)
new_iteration/taskmaster_processor.py CHANGED
@@ -19,27 +19,29 @@ class TaskmasterDialogue:
19
 
20
  def validate(self) -> bool:
21
  return bool(self.conversation_id and isinstance(self.turns, list))
22
-
23
- class PipelineConfig:
24
  """
25
- Example config structure. Adjust to your real config usage.
26
  """
27
  def __init__(
28
  self,
29
  debug: bool = True,
 
30
  min_turns: int = 2,
31
  min_user_words: int = 3
32
  ):
33
  self.debug = debug
 
34
  self.min_turns = min_turns
35
  self.min_user_words = min_user_words
36
 
37
  class TaskmasterProcessor:
38
  """
39
- Loads Taskmaster-1 dialogues, extracts domain from scenario,
40
- cleans + filters them, and outputs a pipeline-friendly format.
41
  """
42
- def __init__(self, config: PipelineConfig):
43
  self.config = config
44
 
45
  def load_taskmaster_dataset(
@@ -48,20 +50,20 @@ class TaskmasterProcessor:
48
  max_examples: Optional[int] = None
49
  ) -> List[TaskmasterDialogue]:
50
  """
51
- Load and parse Taskmaster JSON for self-dialogs & woz-dialogs (Taskmaster-1).
52
- Combines scenario text + conversation utterances to detect domain more robustly.
53
  """
54
  required_files = {
55
  "self-dialogs": "self-dialogs.json",
56
  "woz-dialogs": "woz-dialogs.json",
57
- "ontology": "ontology.json", # we might not actively use it, but let's expect it
58
  }
59
- # 1) Check for missing
 
60
  missing = [k for k, v in required_files.items() if not Path(base_dir, v).exists()]
61
  if missing:
62
  raise FileNotFoundError(f"Missing Taskmaster files: {missing}")
63
 
64
- # 2) Optionally load ontology
65
  ontology_path = Path(base_dir, required_files["ontology"])
66
  with open(ontology_path, 'r', encoding='utf-8') as f:
67
  ontology = json.load(f)
@@ -70,6 +72,7 @@ class TaskmasterProcessor:
70
 
71
  dialogues: List[TaskmasterDialogue] = []
72
 
 
73
  file_keys = ["self-dialogs", "woz-dialogs"]
74
  for file_key in file_keys:
75
  file_path = Path(base_dir, required_files[file_key])
@@ -81,14 +84,14 @@ class TaskmasterProcessor:
81
  instruction_id = d.get("instruction_id", None)
82
  scenario_text = d.get("scenario", "")
83
 
84
- # 3) Convert raw utterances
85
  utterances = d.get("utterances", [])
86
  turns = self._process_utterances(utterances)
87
 
88
- # 4) Domain detection
89
  domain = self._extract_domain(scenario_text, turns)
90
 
91
- # 5) Build the structured object
92
  new_dlg = TaskmasterDialogue(
93
  conversation_id=conversation_id,
94
  instruction_id=instruction_id,
@@ -115,7 +118,7 @@ class TaskmasterProcessor:
115
  txt = turn.get('text', '').lower()
116
  combined_text += " " + txt
117
 
118
- # Expanded domain patterns
119
  domain_patterns = {
120
  'restaurant': r'\b(restaurant|dining|food|reservation|table|menu|cuisine|eat|hungry)\b',
121
  'movie': r'\b(movie|cinema|film|ticket|showtime|theater|flick|screening)\b',
@@ -125,12 +128,12 @@ class TaskmasterProcessor:
125
  'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change)\b'
126
  }
127
 
128
- for dom, pattern in domain_patterns.items():
129
  if re.search(pattern, combined_text):
130
  # Optional: print if debug
131
  if self.config.debug:
132
- print(f"Matched domain: {dom} in scenario/turns")
133
- return dom
134
 
135
  if self.config.debug:
136
  print("No domain match, returning 'other'")
@@ -138,30 +141,26 @@ class TaskmasterProcessor:
138
 
139
  def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
140
  """
141
- Convert raw utterances to a cleaned list of (speaker, text).
142
- Skip or remove lines that are numeric, too short, or empty.
143
  """
144
  cleaned_turns = []
145
  for utt in utterances:
146
  speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
147
  raw_text = utt.get('text', '').strip()
148
 
149
- # 1) Optional text cleaning
150
  text = self._clean_text(raw_text)
151
 
152
- # 2) Skip blank or numeric lines
153
- if not text:
154
- continue
155
- if self._is_numeric_line(text):
156
  continue
157
 
158
- # 3) If it's extremely short, skip.
159
- # (For example, "ok" or "yes" might be 1-2 words.)
160
- if len(text.split()) < 2:
161
- # Optionally keep "ok" or "yes" if you'd like, but let's skip them to keep quality up
162
  continue
163
 
164
- # 4) Append
165
  cleaned_turns.append({
166
  'speaker': speaker,
167
  'text': text
@@ -170,29 +169,24 @@ class TaskmasterProcessor:
170
 
171
  def _clean_text(self, text: str) -> str:
172
  """
173
- Basic text normalization: remove repeated punctuation, handle weird spacing, etc.
174
- Adjust to your needs.
175
  """
176
- # Example: collapse multiple spaces
177
  text = re.sub(r'\s+', ' ', text)
178
- # Example: remove trailing punctuation or repeated punctuation
179
- # e.g. "Sure!!!" => "Sure!"
180
  text = re.sub(r'([!?.,])\1+', r'\1', text)
181
  return text.strip()
182
 
183
  def _is_numeric_line(self, text: str) -> bool:
184
  """
185
  Return True if line is purely digits/punctuation/spaces,
186
- e.g. "4 3 13", "12345", "3.14". Adjust as needed.
187
  """
188
  pattern = r'^[\s]*[\d]+([\s\d.,]+)*[\s]*$'
189
  return bool(re.match(pattern, text))
190
 
191
  def filter_and_convert(self, dialogues: List[TaskmasterDialogue]) -> List[Dict]:
192
  """
193
- Filter out dialogues that don't meet min turns / min user words,
194
- then convert them to final pipeline format:
195
-
196
  {
197
  "dialogue_id": "...",
198
  "domain": "...",
@@ -204,12 +198,11 @@ class TaskmasterProcessor:
204
  if not dlg.validate():
205
  continue
206
 
207
- # If after cleaning, we have too few turns, skip
208
  if len(dlg.turns) < self.config.min_turns:
209
  continue
210
 
211
- # Check user-turn min words
212
- # E.g. user must have >= 3 words
213
  keep = True
214
  for turn in dlg.turns:
215
  if turn['speaker'] == 'user':
@@ -230,4 +223,4 @@ class TaskmasterProcessor:
230
 
231
  if self.config.debug:
232
  print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues after cleaning.")
233
- return results
 
19
 
20
  def validate(self) -> bool:
21
  return bool(self.conversation_id and isinstance(self.turns, list))
22
+
23
+ class RawDataProcessingConfig:
24
  """
25
+ Simple config for raw dataset processing
26
  """
27
  def __init__(
28
  self,
29
  debug: bool = True,
30
+ max_length: int = 512,
31
  min_turns: int = 2,
32
  min_user_words: int = 3
33
  ):
34
  self.debug = debug
35
+ self.max_length = max_length
36
  self.min_turns = min_turns
37
  self.min_user_words = min_user_words
38
 
39
  class TaskmasterProcessor:
40
  """
41
+ Load Taskmaster-1 dialogues and extract the domain.
42
+ Clean, filter, save to pipeline format.
43
  """
44
+ def __init__(self, config: RawDataProcessingConfig):
45
  self.config = config
46
 
47
  def load_taskmaster_dataset(
 
50
  max_examples: Optional[int] = None
51
  ) -> List[TaskmasterDialogue]:
52
  """
53
+ Load & parse Taskmaster-1 JSON for self-dialogs & woz-dialogs.
 
54
  """
55
  required_files = {
56
  "self-dialogs": "self-dialogs.json",
57
  "woz-dialogs": "woz-dialogs.json",
58
+ "ontology": "ontology.json",
59
  }
60
+
61
+ # Check for missing files
62
  missing = [k for k, v in required_files.items() if not Path(base_dir, v).exists()]
63
  if missing:
64
  raise FileNotFoundError(f"Missing Taskmaster files: {missing}")
65
 
66
+ # Load ontology
67
  ontology_path = Path(base_dir, required_files["ontology"])
68
  with open(ontology_path, 'r', encoding='utf-8') as f:
69
  ontology = json.load(f)
 
72
 
73
  dialogues: List[TaskmasterDialogue] = []
74
 
75
+ # Process each file
76
  file_keys = ["self-dialogs", "woz-dialogs"]
77
  for file_key in file_keys:
78
  file_path = Path(base_dir, required_files[file_key])
 
84
  instruction_id = d.get("instruction_id", None)
85
  scenario_text = d.get("scenario", "")
86
 
87
+ # Handle utterances
88
  utterances = d.get("utterances", [])
89
  turns = self._process_utterances(utterances)
90
 
91
+ # Detect Domain
92
  domain = self._extract_domain(scenario_text, turns)
93
 
94
+ # Build the object
95
  new_dlg = TaskmasterDialogue(
96
  conversation_id=conversation_id,
97
  instruction_id=instruction_id,
 
118
  txt = turn.get('text', '').lower()
119
  combined_text += " " + txt
120
 
121
+ # Domain patterns
122
  domain_patterns = {
123
  'restaurant': r'\b(restaurant|dining|food|reservation|table|menu|cuisine|eat|hungry)\b',
124
  'movie': r'\b(movie|cinema|film|ticket|showtime|theater|flick|screening)\b',
 
128
  'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change)\b'
129
  }
130
 
131
+ for domain, pattern in domain_patterns.items():
132
  if re.search(pattern, combined_text):
133
  # Optional: print if debug
134
  if self.config.debug:
135
+ print(f"Matched domain: {domain} in scenario/turns")
136
+ return domain
137
 
138
  if self.config.debug:
139
  print("No domain match, returning 'other'")
 
141
 
142
  def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
143
  """
144
+ Convert "utterances" to a cleaned List -> (speaker, text).
145
+ Skip lines that are numeric, too short, or empty.
146
  """
147
  cleaned_turns = []
148
  for utt in utterances:
149
  speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
150
  raw_text = utt.get('text', '').strip()
151
 
152
+ # Text cleaning
153
  text = self._clean_text(raw_text)
154
 
155
+ # Skip blank or numeric lines (e.g. "4 3 13")
156
+ if not text or self._is_numeric_line(text):
 
 
157
  continue
158
 
159
+ # Skip turns with fewer than 3 words (little training benefit from very short turns, e.g. "ok", "yes").
160
+ if len(text.split()) < 3:
 
 
161
  continue
162
 
163
+ # Add to cleaned turns
164
  cleaned_turns.append({
165
  'speaker': speaker,
166
  'text': text
 
169
 
170
  def _clean_text(self, text: str) -> str:
171
  """
172
+ Simple text normalization
 
173
  """
174
+ # Collapse runs of whitespace into one space and repeated punctuation (e.g. "!!!") into a single mark
175
  text = re.sub(r'\s+', ' ', text)
 
 
176
  text = re.sub(r'([!?.,])\1+', r'\1', text)
177
  return text.strip()
178
 
179
  def _is_numeric_line(self, text: str) -> bool:
180
  """
181
  Return True if line is purely digits/punctuation/spaces,
182
+ e.g. "4 3 13" and similar found in Taskmaster-1 dataset.
183
  """
184
  pattern = r'^[\s]*[\d]+([\s\d.,]+)*[\s]*$'
185
  return bool(re.match(pattern, text))
186
 
187
  def filter_and_convert(self, dialogues: List[TaskmasterDialogue]) -> List[Dict]:
188
  """
189
+ Filter out dialogues that don't meet min length requirements. Convert to pipeline format.
 
 
190
  {
191
  "dialogue_id": "...",
192
  "domain": "...",
 
198
  if not dlg.validate():
199
  continue
200
 
201
+ # Skip if too few turns
202
  if len(dlg.turns) < self.config.min_turns:
203
  continue
204
 
205
+ # Skip if any user turn is too short
 
206
  keep = True
207
  for turn in dlg.turns:
208
  if turn['speaker'] == 'user':
 
223
 
224
  if self.config.debug:
225
  print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues after cleaning.")
226
+ return results