AhmadMustafa committed on
Commit
8498900
·
1 Parent(s): 37287c3

update: SI prompt and Transcript Processor

Browse files
Files changed (1) hide show
  1. app.py +103 -174
app.py CHANGED
@@ -19,14 +19,19 @@ class TranscriptSegment:
19
 
20
 
21
  class TranscriptProcessor:
22
- def __init__(self, transcript_file: str = None, transcript_data: dict = None):
 
 
 
 
 
 
23
  self.transcript_file = transcript_file
24
  self.transcript_data = transcript_data
25
  self.formatted_transcript = None
26
  self.segments = []
27
- self.text_windows = []
28
- self.window_size = 2
29
  self.speaker_mapping = {}
 
30
  if self.transcript_file:
31
  self._load_transcript()
32
  elif self.transcript_data:
@@ -37,7 +42,9 @@ class TranscriptProcessor:
37
  )
38
 
39
  self._process_transcript()
40
- self.map_speaker_ids_to_names()
 
 
41
 
42
  def _load_transcript(self) -> None:
43
  """Load the transcript JSON file."""
@@ -51,10 +58,7 @@ class TranscriptProcessor:
51
  return f"{minutes:02d}:{seconds:02d}"
52
 
53
  def _process_transcript(self) -> None:
54
- """Process the transcript into segments with speaker information and create a formatted version with timestamps."""
55
  results = self.transcript_data["results"]
56
-
57
- # Process into segments
58
  for segment in results["speaker_labels"]["segments"]:
59
  speaker_id = segment.get("speaker_label", segment.get("speakerlabel", ""))
60
  speaker_id = (
@@ -64,55 +68,59 @@ class TranscriptProcessor:
64
  start_time = float(segment.get("start_time", 0))
65
  end_time = float(segment.get("end_time", 0))
66
 
67
- items = [
68
- item
69
- for item in results["items"]
70
- if "start_time" in item
71
- and float(item["start_time"]) >= start_time
72
- and float(item["start_time"]) < end_time
73
- and item["type"] == "pronunciation"
74
- ]
75
-
76
- words = [item["alternatives"][0]["content"] for item in items]
77
- if words:
78
- self.segments.append(
79
- TranscriptSegment(
80
- speaker_id=speaker_id,
81
- start_time=start_time,
82
- end_time=end_time,
83
- text=" ".join(words),
84
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  )
 
86
 
 
 
87
  formatted_segments = []
88
  for seg in self.segments:
89
  start_time_str = self._format_time(seg.start_time)
90
  end_time_str = self._format_time(seg.end_time)
 
 
91
  formatted_segments.append(
92
  f"time_stamp: {start_time_str}-{end_time_str}\n"
93
- f"spk {seg.speaker_id}: {seg.text}\n"
94
  )
95
-
96
  self.formatted_transcript = "\n".join(formatted_segments)
97
 
98
- # Create sliding windows of text for better matching
99
- for i in range(len(self.segments)):
100
- # Combine current segment with next segments within window
101
- window_segments = self.segments[i : i + self.window_size]
102
- combined_text = " ".join(seg.text for seg in window_segments)
103
- if window_segments:
104
- self.text_windows.append(
105
- {
106
- "text": combined_text,
107
- "start_time": window_segments[0].start_time,
108
- "end_time": window_segments[-1].end_time,
109
- }
110
- )
111
-
112
  def map_speaker_ids_to_names(self) -> None:
113
  """Map speaker IDs to names based on introductions in the transcript."""
114
  try:
115
-
116
  transcript = self.formatted_transcript
117
 
118
  prompt = (
@@ -135,7 +143,6 @@ class TranscriptProcessor:
135
  try:
136
  self.speaker_mapping = json.loads(response_text)
137
  except json.JSONDecodeError:
138
- # extract left most and right most {}
139
  response_text = response_text[
140
  response_text.find("{") : response_text.rfind("}") + 1
141
  ]
@@ -144,124 +151,31 @@ class TranscriptProcessor:
144
  except json.JSONDecodeError:
145
  print("Error parsing speaker mapping JSON.")
146
  self.speaker_mapping = {}
 
 
147
  for segment in self.segments:
148
  spk_id = f"spk_{segment.speaker_id}"
149
  speaker_name = self.speaker_mapping.get(spk_id, spk_id)
150
  segment.speaker_name = speaker_name
151
 
152
- # Recreate the formatted transcript with speaker names
153
- formatted_segments = []
154
- for seg in self.segments:
155
- start_time_str = self._format_time(seg.start_time)
156
- end_time_str = self._format_time(seg.end_time)
157
- formatted_segments.append(
158
- f"time_stamp: {start_time_str}-{end_time_str}\n"
159
- f"{seg.speaker_name}: {seg.text}\n"
160
- )
161
- self.formatted_transcript = "\n".join(formatted_segments)
162
 
163
  except Exception as e:
164
  print(f"Error mapping speaker IDs to names: {str(e)}")
165
  self.speaker_mapping = {}
166
 
167
- def correct_speaker_mapping_with_agenda(self, url: str) -> None:
168
- """Fetch agenda from a URL and correct the speaker mapping using OpenAI."""
169
- try:
170
-
171
- response = requests.get(url)
172
- response.raise_for_status()
173
- html_content = response.text
174
-
175
- # Parse the HTML to find the desired description
176
- soup = BeautifulSoup(html_content, "html.parser")
177
- description_tag = soup.find(
178
- "script", {"type": "application/ld+json"}
179
- ) # Find the ld+json metadata block
180
- agenda = ""
181
-
182
- if description_tag:
183
- # Extract the JSON content
184
- json_data = json.loads(description_tag.string)
185
- if "description" in json_data:
186
- agenda = json_data["description"]
187
- else:
188
- print("Agenda description not found in the JSON metadata.")
189
- else:
190
- print("No structured data (ld+json) found.")
191
-
192
- if not agenda:
193
- print("No agenda found in the structured metadata. Trying meta tags.")
194
-
195
- # Fallback: Use meta description if ld+json doesn't have it
196
- meta_description = soup.find("meta", {"name": "description"})
197
- agenda = meta_description["content"] if meta_description else ""
198
-
199
- if not agenda:
200
- print("No agenda found in any description tags.")
201
- return
202
-
203
- print(self.speaker_mapping)
204
-
205
- prompt = (
206
- f"Given the original speaker mapping {self.speaker_mapping}, agenda:\n{agenda}, and the transcript: {self.formatted_transcript}\n\n"
207
- "Some speaker names in the mapping might have spelling errors or be incomplete."
208
- "Remember that the content in agenda is accurate and transcript can have errors so prioritize the spellings and names in the agenda content."
209
- "If the speaker name and introduction is similar to the agenda, update the speaker name in the mapping."
210
- "Please correct the names based on the agenda. Return the corrected mapping in JSON format as "
211
- "{'spk_0': 'Correct Name', 'spk_1': 'Correct Name', ...}."
212
- "You should only update the name if the name sounds very similar, or there is a good spelling overlap/ The Speaker Introduction matches the description of the Talk from Agends. If the name is totally unrelated, keep the original name."
213
- "You should always include all the speakers in the mapping from the original mapping, even if you don't update their names. i.e if there are 4 speakers in original mapping, new mapping should have 4 speakers always, ignore all the other spekaers in the agenda. I REPEAT DO NOT ADD OTHER NEW SPEAKERS IN THE MAPPING."
214
- )
215
-
216
- client = OpenAI()
217
-
218
- completion = client.chat.completions.create(
219
- model="gpt-4o-mini",
220
- messages=[
221
- {"role": "system", "content": "You are a helpful assistant."},
222
- {"role": "user", "content": prompt},
223
- ],
224
- temperature=0,
225
  )
226
-
227
- response_text = completion.choices[0].message.content.strip()
228
- try:
229
- corrected_mapping = json.loads(response_text)
230
- except Exception:
231
- response_text = response_text[
232
- response_text.find("{") : response_text.rfind("}") + 1
233
- ]
234
- try:
235
- corrected_mapping = json.loads(response_text)
236
- except json.JSONDecodeError:
237
- print(
238
- "Error parsing corrected speaker mapping JSON, keeping the original mapping."
239
- )
240
- corrected_mapping = self.speaker_mapping
241
- # Update the speaker mapping with corrected names
242
- self.speaker_mapping = corrected_mapping
243
- print("Corrected Speaker Mapping:", self.speaker_mapping)
244
-
245
- # Update the transcript segments with corrected names
246
- for segment in self.segments:
247
- spk_id = f"spk_{segment.speaker_id}"
248
- segment.speaker_name = self.speaker_mapping.get(spk_id, spk_id)
249
-
250
- # Recreate the formatted transcript with corrected names
251
- formatted_segments = []
252
- for seg in self.segments:
253
- start_time_str = self._format_time(seg.start_time)
254
- end_time_str = self._format_time(seg.end_time)
255
- formatted_segments.append(
256
- f"time_stamp: {start_time_str}-{end_time_str}\n"
257
- f"{seg.speaker_name}: {seg.text}\n"
258
- )
259
- self.formatted_transcript = "\n".join(formatted_segments)
260
-
261
- except requests.exceptions.RequestException as e:
262
- print(f"Error fetching agenda from URL: {str(e)}")
263
- except Exception as e:
264
- print(f"Error correcting speaker mapping: {str(e)}")
265
 
266
  def get_transcript(self) -> str:
267
  """Return the formatted transcript with speaker names."""
@@ -333,7 +247,6 @@ def get_initial_analysis(
333
  link_start = "http"
334
  else:
335
  link_start = "https"
336
- print(uid)
337
  if ct == "si": # street interview
338
  prompt = f"""This is a transcript for a street interview. Call Details are as follows:
339
  User ID UID: {uid}
@@ -381,6 +294,22 @@ Total takes: 2
381
  - [Take 1. <div id='topic' style="display: inline"> 10s at 09:45]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{585}}&et={{595}}&uid={{uid}})
382
  - [Take 1. <div id='topic' style="display: inline"> 20s at 25:45]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{1245}}&et={{1265}}&uid={{uid}}))
383
  - [Take 3 (Best). <div id='topic' style="display: inline"> 5s at 10:13 </div>]({link_start}://roll.ai/colab/1234aq_12314/51234151?st=613&et=618&uid=82314)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  else:
385
  prompt = f"""Call Details:
386
  User ID: {uid}
@@ -420,28 +349,27 @@ Format requirements:
420
  - Topics should be self-contained discussions within the timestamp
421
  - Skip speakers if fewer than 2 compelling topics found
422
  """
423
- print(prompt)
424
- completion = client.chat.completions.create(
425
- model="gpt-4o-mini",
426
- messages=[
427
- {
428
- "role": "system",
429
- "content": f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, Call Type: {ct}.
430
-
431
- CORE REQUIREMENTS:
432
- 1. TIMESTAMPS: Each clip must contain ONLY the specified speaker's dialogue about a single topic. No overlapping dialogue from other speakers. YOU NEED TO BE VERY CAREFUL ABOUT THIS RULE. YOU HAVE THE TRANSCRIPT AND YOU CAN SEE WHO IS SPEAKING AT WHAT TIME SO BE VERY VERY CARAEFUL AND ONLY INCLUDE THE DIALOGUE OF THE SPEAKER YOU ARE MAKING THE CLIP FOR.
433
- 2. DURATION: Clips should be between 20-60 seconds long.
434
- 3. CONTENT: Select engaging, viral-worthy topics. Avoid mundane or irrelevant content
435
- 4. COVERAGE: Minimum 2 topics per speaker, aim for 3 if good content exists.
436
- 5. YOU CAN IGNORE THE HOST IF NO COMPELLING CONTENT IS FOUND.
437
-
438
- YOU SHOULD Prioritize accuracy in timestamp at every cost.""",
439
- },
440
- {"role": "user", "content": prompt},
441
- ],
442
- stream=True,
443
- temperature=0.5,
444
- )
445
 
446
  collected_messages = []
447
  # Iterate through the stream
@@ -815,7 +743,6 @@ def create_chat_interface():
815
  ct = request.query_params.get("ct", None)
816
  turl = request.query_params.get("turl", None)
817
  uid = request.query_params.get("uid", None)
818
- print(uid)
819
 
820
  required_params = ["cid", "rsid", "origin", "ct", "turl", "uid"]
821
  missing_params = [
@@ -834,7 +761,9 @@ def create_chat_interface():
834
  try:
835
  transcript_data = get_transcript_for_url(turl)
836
  transcript_processor = TranscriptProcessor(
837
- transcript_data=transcript_data
 
 
838
  )
839
 
840
  # Initialize with empty message
 
19
 
20
 
21
  class TranscriptProcessor:
22
+ def __init__(
23
+ self,
24
+ transcript_file: str = None,
25
+ transcript_data: dict = None,
26
+ max_segment_duration: int = None,
27
+ call_type: str = "le",
28
+ ):
29
  self.transcript_file = transcript_file
30
  self.transcript_data = transcript_data
31
  self.formatted_transcript = None
32
  self.segments = []
 
 
33
  self.speaker_mapping = {}
34
+ self.max_segment_duration = max_segment_duration
35
  if self.transcript_file:
36
  self._load_transcript()
37
  elif self.transcript_data:
 
42
  )
43
 
44
  self._process_transcript()
45
+ self._create_formatted_transcript() # Create initial formatted transcript
46
+ if call_type != "si":
47
+ self.map_speaker_ids_to_names()
48
 
49
  def _load_transcript(self) -> None:
50
  """Load the transcript JSON file."""
 
58
  return f"{minutes:02d}:{seconds:02d}"
59
 
60
  def _process_transcript(self) -> None:
 
61
  results = self.transcript_data["results"]
 
 
62
  for segment in results["speaker_labels"]["segments"]:
63
  speaker_id = segment.get("speaker_label", segment.get("speakerlabel", ""))
64
  speaker_id = (
 
68
  start_time = float(segment.get("start_time", 0))
69
  end_time = float(segment.get("end_time", 0))
70
 
71
+ if self.max_segment_duration:
72
+ current_start = start_time
73
+ while current_start < end_time:
74
+ current_end = min(
75
+ current_start + self.max_segment_duration, end_time
 
 
 
 
 
 
 
 
 
 
 
 
76
  )
77
+ self._create_segment(
78
+ speaker_id, current_start, current_end, results["items"]
79
+ )
80
+ current_start = current_end
81
+ else:
82
+ self._create_segment(speaker_id, start_time, end_time, results["items"])
83
+
84
+ def _create_segment(
85
+ self, speaker_id: str, start: float, end: float, items: list
86
+ ) -> None:
87
+ matching_items = [
88
+ item
89
+ for item in items
90
+ if "start_time" in item
91
+ and float(item["start_time"]) >= start
92
+ and float(item["start_time"]) < end
93
+ and item["type"] == "pronunciation"
94
+ ]
95
+
96
+ words = [item["alternatives"][0]["content"] for item in matching_items]
97
+ if words:
98
+ self.segments.append(
99
+ TranscriptSegment(
100
+ speaker_id=speaker_id,
101
+ start_time=start,
102
+ end_time=end,
103
+ text=" ".join(words),
104
  )
105
+ )
106
 
107
+ def _create_formatted_transcript(self) -> None:
108
+ """Create formatted transcript with default speaker labels."""
109
  formatted_segments = []
110
  for seg in self.segments:
111
  start_time_str = self._format_time(seg.start_time)
112
  end_time_str = self._format_time(seg.end_time)
113
+ # Use default speaker label (spk_X) if no mapping exists
114
+ speaker_label = f"spk_{seg.speaker_id}"
115
  formatted_segments.append(
116
  f"time_stamp: {start_time_str}-{end_time_str}\n"
117
+ f"{speaker_label}: {seg.text}\n"
118
  )
 
119
  self.formatted_transcript = "\n".join(formatted_segments)
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  def map_speaker_ids_to_names(self) -> None:
122
  """Map speaker IDs to names based on introductions in the transcript."""
123
  try:
 
124
  transcript = self.formatted_transcript
125
 
126
  prompt = (
 
143
  try:
144
  self.speaker_mapping = json.loads(response_text)
145
  except json.JSONDecodeError:
 
146
  response_text = response_text[
147
  response_text.find("{") : response_text.rfind("}") + 1
148
  ]
 
151
  except json.JSONDecodeError:
152
  print("Error parsing speaker mapping JSON.")
153
  self.speaker_mapping = {}
154
+
155
+ # Update segments with speaker names and recreate formatted transcript
156
  for segment in self.segments:
157
  spk_id = f"spk_{segment.speaker_id}"
158
  speaker_name = self.speaker_mapping.get(spk_id, spk_id)
159
  segment.speaker_name = speaker_name
160
 
161
+ self._create_formatted_transcript_with_names()
 
 
 
 
 
 
 
 
 
162
 
163
  except Exception as e:
164
  print(f"Error mapping speaker IDs to names: {str(e)}")
165
  self.speaker_mapping = {}
166
 
167
+ def _create_formatted_transcript_with_names(self) -> None:
168
+ """Create formatted transcript with mapped speaker names."""
169
+ formatted_segments = []
170
+ for seg in self.segments:
171
+ start_time_str = self._format_time(seg.start_time)
172
+ end_time_str = self._format_time(seg.end_time)
173
+ speaker_name = getattr(seg, "speaker_name", f"spk_{seg.speaker_id}")
174
+ formatted_segments.append(
175
+ f"time_stamp: {start_time_str}-{end_time_str}\n"
176
+ f"{speaker_name}: {seg.text}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  )
178
+ self.formatted_transcript = "\n".join(formatted_segments)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  def get_transcript(self) -> str:
181
  """Return the formatted transcript with speaker names."""
 
247
  link_start = "http"
248
  else:
249
  link_start = "https"
 
250
  if ct == "si": # street interview
251
  prompt = f"""This is a transcript for a street interview. Call Details are as follows:
252
  User ID UID: {uid}
 
294
  - [Take 1. <div id='topic' style="display: inline"> 10s at 09:45]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{585}}&et={{595}}&uid={{uid}})
295
  - [Take 1. <div id='topic' style="display: inline"> 20s at 25:45]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{1245}}&et={{1265}}&uid={{uid}}))
296
  - [Take 3 (Best). <div id='topic' style="display: inline"> 5s at 10:13 </div>]({link_start}://roll.ai/colab/1234aq_12314/51234151?st=613&et=618&uid=82314)"""
297
+ completion = client.chat.completions.create(
298
+ model="gpt-4o-mini",
299
+ messages=[
300
+ {
301
+ "role": "system",
302
+ "content": f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, Call Type: {ct}.
303
+ CORE REQUIREMENT:
304
+ - TIMESTAMPS: A speaker can repeat the answer to a question multiple times. You need to pick the last answer very carefully and choose that as best take. Make sure that that same answer is not repeated again after the best answer.
305
+
306
+ YOU SHOULD Prioritize accuracy in timestamp at every cost. Read the Transcript carefully and decide where an answer starts and ends. You will have speaker labels so you need to be very sharp.""",
307
+ },
308
+ {"role": "user", "content": prompt},
309
+ ],
310
+ stream=True,
311
+ temperature=0.5,
312
+ )
313
  else:
314
  prompt = f"""Call Details:
315
  User ID: {uid}
 
349
  - Topics should be self-contained discussions within the timestamp
350
  - Skip speakers if fewer than 2 compelling topics found
351
  """
352
+ completion = client.chat.completions.create(
353
+ model="gpt-4o-mini",
354
+ messages=[
355
+ {
356
+ "role": "system",
357
+ "content": f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, Call Type: {ct}.
358
+
359
+ CORE REQUIREMENTS:
360
+ 1. TIMESTAMPS: Each clip must contain ONLY the specified speaker's dialogue about a single topic. No overlapping dialogue from other speakers. YOU NEED TO BE VERY CAREFUL ABOUT THIS RULE. YOU HAVE THE TRANSCRIPT AND YOU CAN SEE WHO IS SPEAKING AT WHAT TIME SO BE VERY VERY CARAEFUL AND ONLY INCLUDE THE DIALOGUE OF THE SPEAKER YOU ARE MAKING THE CLIP FOR.
361
+ 2. DURATION: Clips should be between 20-60 seconds long.
362
+ 3. CONTENT: Select engaging, viral-worthy topics. Avoid mundane or irrelevant content
363
+ 4. COVERAGE: Minimum 2 topics per speaker, aim for 3 if good content exists.
364
+ 5. YOU CAN IGNORE THE HOST IF NO COMPELLING CONTENT IS FOUND.
365
+
366
+ YOU SHOULD Prioritize accuracy in timestamp at every cost.""",
367
+ },
368
+ {"role": "user", "content": prompt},
369
+ ],
370
+ stream=True,
371
+ temperature=0.5,
372
+ )
 
373
 
374
  collected_messages = []
375
  # Iterate through the stream
 
743
  ct = request.query_params.get("ct", None)
744
  turl = request.query_params.get("turl", None)
745
  uid = request.query_params.get("uid", None)
 
746
 
747
  required_params = ["cid", "rsid", "origin", "ct", "turl", "uid"]
748
  missing_params = [
 
761
  try:
762
  transcript_data = get_transcript_for_url(turl)
763
  transcript_processor = TranscriptProcessor(
764
+ transcript_data=transcript_data,
765
+ max_segment_duration=10,
766
+ call_type=ct,
767
  )
768
 
769
  # Initialize with empty message