AhmadMustafa committed on
Commit eb33652 · 1 Parent(s): fecd16e

update: transcript processor

Files changed (1)
  1. app.py +237 -70
app.py CHANGED
@@ -59,51 +59,199 @@ class TranscriptProcessor:
 
     def _process_transcript(self) -> None:
         results = self.transcript_data["results"]
-        for segment in results["speaker_labels"]["segments"]:
-            speaker_id = segment.get("speaker_label", segment.get("speakerlabel", ""))
-            speaker_id = (
-                speaker_id.replace("spk_", "").replace("spk", "") if speaker_id else ""
-            )
-
-            start_time = float(segment.get("start_time", 0))
-            end_time = float(segment.get("end_time", 0))
+        current_words = []
+        current_speaker = None
+        current_start = None
+        current_items = []
+
+        for item in results["items"]:
+            if item["type"] == "pronunciation":
+                speaker = (
+                    item.get("speaker_label", "").replace("spk_", "").replace("spk", "")
+                )
 
-            if self.max_segment_duration:
-                current_start = start_time
-                while current_start < end_time:
-                    current_end = min(
-                        current_start + self.max_segment_duration, end_time
-                    )
-                    self._create_segment(
-                        speaker_id, current_start, current_end, results["items"]
+                # Initialize on first pronunciation item
+                if current_speaker is None:
+                    current_speaker = speaker
+                    current_start = float(item["start_time"])
+
+                # Check for speaker change
+                if speaker != current_speaker:
+                    if current_items:
+                        self._create_segment(
+                            current_speaker,
+                            current_start,
+                            float(item["start_time"]),
+                            current_items,
+                        )
+                        current_items = []
+                        current_words = []
+                    current_speaker = speaker
+                    current_start = float(item["start_time"])
+
+                current_items.append(item)
+                current_words.append(item["alternatives"][0]["content"])
+            elif item["type"] == "punctuation":
+                current_items.append(item)
+                # Only check for segment break if we're over 20 words
+                if len(current_words) >= 20:
+                    # Break on this punctuation
+                    next_item = next(
+                        (
+                            it
+                            for it in results["items"][
+                                results["items"].index(item) + 1 :
+                            ]
+                            if it["type"] == "pronunciation"
+                        ),
+                        None,
                     )
-                    current_start = current_end
-            else:
-                self._create_segment(speaker_id, start_time, end_time, results["items"])
+                    if next_item:
+                        self._create_segment(
+                            current_speaker,
+                            current_start,
+                            float(next_item["start_time"]),
+                            current_items,
+                        )
+                        current_items = []
+                        current_words = []
+                        current_start = float(next_item["start_time"])
+
+        # Don't forget the last segment
+        if current_items:
+            last_time = max(
+                float(item["end_time"])
+                for item in current_items
+                if item["type"] == "pronunciation"
+            )
+            self._create_segment(
+                current_speaker, current_start, last_time, current_items
+            )
 
     def _create_segment(
         self, speaker_id: str, start: float, end: float, items: list
     ) -> None:
-        matching_items = [
-            item
-            for item in items
-            if "start_time" in item
-            and float(item["start_time"]) >= start
-            and float(item["start_time"]) < end
-            and item["type"] == "pronunciation"
-        ]
-
-        words = [item["alternatives"][0]["content"] for item in matching_items]
-        if words:
+        segment_content = []
+        for item in items:
+            if item["type"] == "pronunciation":
+                segment_content.append(item["alternatives"][0]["content"])
+            elif item["type"] == "punctuation":
+                # Append punctuation to the last word without a space
+                if segment_content:
+                    segment_content[-1] += item["alternatives"][0]["content"]
+
+        if segment_content:
             self.segments.append(
                 TranscriptSegment(
                     speaker_id=speaker_id,
                     start_time=start,
                     end_time=end,
-                    text=" ".join(words),
+                    text=" ".join(segment_content),
                 )
             )
 
+    def correct_speaker_mapping_with_agenda(self, url: str) -> None:
+        """Fetch agenda from a URL and correct the speaker mapping using OpenAI."""
+        try:
+            if not url.startswith("http"):
+                # add https to the url
+                url = "https://" + url
+
+            response = requests.get(url)
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse the HTML to find the desired description
+            soup = BeautifulSoup(html_content, "html.parser")
+            description_tag = soup.find(
+                "script", {"type": "application/ld+json"}
+            )  # Find the ld+json metadata block
+            agenda = ""
+
+            if description_tag:
+                # Extract the JSON content
+                json_data = json.loads(description_tag.string)
+                if "description" in json_data:
+                    agenda = json_data["description"]
+                else:
+                    print("Agenda description not found in the JSON metadata.")
+            else:
+                print("No structured data (ld+json) found.")
+
+            if not agenda:
+                print("No agenda found in the structured metadata. Trying meta tags.")
+
+                # Fallback: use the meta description if ld+json doesn't have it
+                meta_description = soup.find("meta", {"name": "description"})
+                agenda = meta_description["content"] if meta_description else ""
+
+            if not agenda:
+                print("No agenda found in any description tags.")
+                return
+
+            print(self.speaker_mapping)
+
+            prompt = (
+                f"Given the original speaker mapping {self.speaker_mapping}, agenda:\n{agenda}, and the transcript: {self.formatted_transcript}\n\n"
+                "Some speaker names in the mapping might have spelling errors or be incomplete. "
+                "Remember that the content in the agenda is accurate and the transcript can have errors, so prioritize the spellings and names in the agenda content. "
+                "If the speaker name and introduction is similar to the agenda, update the speaker name in the mapping. "
+                "Please correct the names based on the agenda. Return the corrected mapping in JSON format as "
+                "{'spk_0': 'Correct Name', 'spk_1': 'Correct Name', ...}. "
+                "You should only update the name if the name sounds very similar, there is a good spelling overlap, or the speaker's introduction matches the description of the talk from the agenda. If the name is totally unrelated, keep the original name. "
+                "You should always include all the speakers in the mapping from the original mapping, even if you don't update their names, i.e. if there are 4 speakers in the original mapping, the new mapping should always have 4 speakers; ignore all the other speakers in the agenda. I REPEAT: DO NOT ADD OTHER NEW SPEAKERS IN THE MAPPING."
+            )
+
+            client = OpenAI()
+
+            completion = client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0,
+            )
+
+            response_text = completion.choices[0].message.content.strip()
+            try:
+                corrected_mapping = json.loads(response_text)
+            except Exception:
+                response_text = response_text[
+                    response_text.find("{") : response_text.rfind("}") + 1
+                ]
+                try:
+                    corrected_mapping = json.loads(response_text)
+                except json.JSONDecodeError:
+                    print(
+                        "Error parsing corrected speaker mapping JSON, keeping the original mapping."
+                    )
+                    corrected_mapping = self.speaker_mapping
+            # Update the speaker mapping with corrected names
+            self.speaker_mapping = corrected_mapping
+            print("Corrected Speaker Mapping:", self.speaker_mapping)
+
+            # Update the transcript segments with corrected names
+            for segment in self.segments:
+                spk_id = f"spk_{segment.speaker_id}"
+                segment.speaker_name = self.speaker_mapping.get(spk_id, spk_id)
+
+            # Recreate the formatted transcript with corrected names
+            formatted_segments = []
+            for seg in self.segments:
+                start_time_str = self._format_time(seg.start_time)
+                end_time_str = self._format_time(seg.end_time)
+                formatted_segments.append(
+                    f"time_stamp: {start_time_str}-{end_time_str}\n"
+                    f"{seg.speaker_name}: {seg.text}\n"
+                )
+            self.formatted_transcript = "\n".join(formatted_segments)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching agenda from URL: {str(e)}")
+        except Exception as e:
+            print(f"Error correcting speaker mapping: {str(e)}")
+
     def _create_formatted_transcript(self) -> None:
         """Create formatted transcript with default speaker labels."""
         formatted_segments = []
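The rewritten `_process_transcript` above no longer trusts `speaker_labels.segments`; it walks `results["items"]` directly, opening a new segment on every speaker change and splitting a long turn at the first punctuation mark once at least 20 words have accumulated. A minimal sketch of that behavior on an AWS Transcribe-style payload (toy data; the field shapes follow the code above):

```python
# Toy AWS Transcribe-style items, shaped like the payload consumed above.
items = [
    {"type": "pronunciation", "speaker_label": "spk_0",
     "start_time": "0.0", "end_time": "0.4",
     "alternatives": [{"content": "Hello"}]},
    {"type": "punctuation", "alternatives": [{"content": ","}]},
    {"type": "pronunciation", "speaker_label": "spk_0",
     "start_time": "0.5", "end_time": "0.9",
     "alternatives": [{"content": "everyone"}]},
    {"type": "punctuation", "alternatives": [{"content": "."}]},
    {"type": "pronunciation", "speaker_label": "spk_1",
     "start_time": "1.2", "end_time": "1.6",
     "alternatives": [{"content": "Thanks"}]},
]

# Fed through the logic above, this yields two segments:
#   spk_0: 0.0 -> 1.2, text "Hello, everyone."  (closed when spk_1 starts)
#   spk_1: 1.2 -> 1.6, text "Thanks"            (flushed as the final segment)
# The 20-word punctuation break never fires here; on a longer turn it would
# split at the first punctuation after the 20th word, ending the segment at
# the next pronunciation item's start_time.
```

One thing to watch: `results["items"].index(item)` does a linear scan and matches by value, so an `enumerate`-based loop would be cheaper and safer on very long transcripts.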
@@ -314,13 +462,30 @@ Total takes: 2
     system_prompt = f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, and Call Type: {ct}.
 
 CORE REQUIREMENTS:
-1. TIMESTAMPS: Each clip must contain ONLY the specified speaker's dialogue about a single topic. No overlapping dialogue from other speakers. YOU NEED TO BE VERY CAREFUL ABOUT THIS RULE. YOU HAVE THE TRANSCRIPT AND YOU CAN SEE WHO IS SPEAKING AT WHAT TIME, SO BE VERY, VERY CAREFUL AND ONLY INCLUDE THE DIALOGUE OF THE SPEAKER YOU ARE MAKING THE CLIP FOR.
-2. DURATION: Clips should be between 20-90 seconds long.
-3. CONTENT: Select engaging, viral-worthy topics. Avoid mundane or irrelevant content.
-4. COVERAGE: Minimum 2 topics per speaker, aim for 3 if good content exists.
-5. YOU CAN IGNORE THE HOST IF NO COMPELLING CONTENT IS FOUND.
-
-YOU SHOULD prioritize accuracy in timestamps at all costs.
+1. SPEAKER ISOLATION: When creating a clip, you must:
+   - Include ONLY continuous dialogue from ONE speaker
+   - The speaker must talk continuously without any interruptions
+   - As soon as another speaker starts talking, the clip MUST end
+   - Never combine dialogue across interruptions, even from the same speaker
+
+2. DURATION RULES:
+   - Each clip must be 20-100 seconds of CONTINUOUS speech
+   - If a speaker's dialogue is interrupted before 20 seconds, it cannot be used
+   - The clock starts when the speaker begins and must end before any other speaker starts
+
+3. TOPIC COHERENCE:
+   - Each clip must cover one complete topic/thought
+   - Must end before the speaker changes topics
+   - Content should be engaging and viral-worthy
+
+4. SPEAKER COVERAGE:
+   - Minimum 2 topics per speaker, aim for 3 if good content exists
+   - The host can be skipped if no compelling content is found
+
+CRITICAL: When analyzing timestamps, you must verify that:
+1. No other speaker talks during the selected timeframe
+2. The speaker talks continuously for at least 20 seconds
+3. The clip ends BEFORE any interruption or speaker change
     """
 
     user_prompt = f"""Call Details:
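The tightened CORE REQUIREMENTS above are enforced only through prompting; nothing in this commit re-checks the clips the model returns. If post-hoc checking were wanted, a small validator against the parsed segments could mirror the same rules (hypothetical helper, not part of this commit; `segments` is the list of `TranscriptSegment` objects built above):

```python
# Hypothetical, not in this commit: re-check a proposed clip against the
# parsed segments using the prompt's own rules -- a single speaker and a
# 20-100 second continuous window.
def validate_clip(segments, speaker_id: str, st: float, et: float) -> bool:
    if not 20 <= et - st <= 100:
        return False  # violates DURATION RULES
    for seg in segments:
        overlaps = seg.start_time < et and seg.end_time > st
        if overlaps and seg.speaker_id != speaker_id:
            return False  # another speaker talks inside the window
    return True
```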
@@ -328,40 +493,34 @@ User ID: {uid}
 Call ID: {cid}
 Speakers: {", ".join(speaker_mapping.values())}
 Transcript: {transcript}
-Your task is to analyze speakers' discussions to identify compelling social media clips. For each speaker, identify key topics that mention people, news, events, trends, or sources.
 
-Format requirements:
+Your task is to create social media clips following these strict rules:
+
+1. TIMESTAMP SELECTION:
+   - You must check the transcript line by line
+   - Verify speaker continuity with NO interruptions
+   - End clips immediately before any other speaker starts
+   - If Speaker A talks from 1:00-1:10, then Speaker B talks, then Speaker A resumes at 1:15, these must be separate clips
+   - Never combine timestamps across interruptions
 
-1. SPEAKER FORMAT:
+2. CLIP REQUIREMENTS:
+   - Minimum 20 seconds of CONTINUOUS speech
+   - Maximum 100 seconds
+   - Single speaker only
+   - Must end before any interruption
+   - Complete thoughts/topics only
+
+
+Return Format requirements:
+SPEAKER FORMAT:
 **Speaker Name**
 1. [Topic title <div id='topic' style="display: inline"> 22s at 12:30 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{750}}&et={{772}}&uid={{uid}})
 2. [Topic title <div id='topic' style="display: inline"> 43s at 14:45 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{885}}&et={{928}}&uid={{uid}})
 3. [Topic title <div id='topic' style="display: inline"> 58s at 16:20 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{980}}&et={{1038}}&uid={{uid}})
-
-2. TIMESTAMP RULES:
-   - Start time (st): Must begin exactly when speaker starts discussing the specific topic.
-   - End time (et): Must end exactly when either:
-     * The speaker completes their point, or
-     * Before the next speaker begins.
-   - NO OVERLAP: Selected duration must NEVER include dialogue from other speakers.
-   - Duration limits: Minimum 20 seconds, maximum 1 minute 30 seconds.
-   - Time format: "Xs at HH:MM" where X = seconds.
-   - URL parameters: Convert display times to seconds.
-     Example: "25s at 10:13" → st=613&et=638.
-
-3. FORMATTING RULES:
-   - Speaker names: Use markdown bold (**Name**).
-   - Topic titles: First word capitalized, rest lowercase.
-   - Each topic must be a clickable link with correct timestamp.
-   - URL format: {{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{start_time_in_sec}}&et={{end_time_in_sec}}&uid={{uid}}
-
-4. TOPIC SELECTION:
-   - Prioritize engaging, viral-worthy content.
-   - Minimum 2 topics per speaker, aim for 3 if available (SKIP THE HOST if no compelling content).
-   - Topics should be self-contained discussions within the timestamp.
-   - Skip speakers if fewer than 2 compelling topics found.
+**Speaker Name**
+....
     """
-    print(user_prompt)
+    print(user_prompt, speaker_mapping)
 
     completion = client.chat.completions.create(
         model="gpt-4o",
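The display-time convention survives the rewrite: each topic still shows "Xs at MM:SS" while the link carries `st`/`et` in plain seconds, and the deleted rules spelled the conversion out as "25s at 10:13" → st=613&et=638. A sketch of that conversion (the helper name is illustrative, not from the source):

```python
# Illustrative only: turn the prompt's display format into st/et URL params.
# "25s at 10:13" -> clip starts at 10*60 + 13 = 613 s, ends at 613 + 25 = 638 s.
def display_to_params(display: str) -> tuple[int, int]:
    dur_part, at_part = display.split(" at ")
    duration = int(dur_part.rstrip("s"))
    minutes, seconds = map(int, at_part.split(":"))
    st = minutes * 60 + seconds
    return st, st + duration

assert display_to_params("25s at 10:13") == (613, 638)
```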
@@ -479,7 +638,6 @@ If the user provides a link to the agenda, use the correct_speaker_name_with_url
 If the user provides the correct call type, use the correct_call_type function to correct the call type. Call Type for street interviews is 'si'.
     """
     messages = [{"role": "system", "content": prompt}]
-    print(messages[0]["content"])
 
     for user_msg, assistant_msg in chat_history:
         if user_msg is not None:  # Skip the initial message where user_msg is None
@@ -522,10 +680,14 @@ If the user provides the correct call type, use the correct_call_type function t
                         corrected_speaker_mapping = (
                             transcript_processor.speaker_mapping
                         )
+                        messages.append(response.choices[0].message)
+
                         function_call_result_message = {
                             "role": "tool",
                             "content": json.dumps(
-                                {"speaker_mapping": f"Corrected Speaker Mapping..."}
+                                {
+                                    "speaker_mapping": f"Corrected Speaker Mapping... {corrected_speaker_mapping}"
+                                }
                             ),
                             "name": tool_call.function.name,
                             "tool_call_id": tool_call.id,
@@ -534,13 +696,18 @@ If the user provides the correct call type, use the correct_call_type function t
 
                         # Get final response after tool call
                         final_response = client.chat.completions.create(
-                            model="gpt-4o", messages=messages, stream=True
+                            model="gpt-4o",
+                            messages=messages,
+                            stream=True,
                         )
 
-                        # Stream the final response
+                        collected_chunk = ""
                         for final_chunk in final_response:
                             if final_chunk.choices[0].delta.content:
-                                yield final_chunk.choices[0].delta.content
+                                collected_chunk += final_chunk.choices[
+                                    0
+                                ].delta.content
+                                yield collected_chunk
                         return
                     else:
                         function_call_result_message = {
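The changed yield is behavioral, not cosmetic: the old loop yielded each delta, while the new one yields the accumulated `collected_chunk` on every iteration, which suits chat UIs (e.g. a Gradio `Chatbot`) that re-render the whole assistant message on each update rather than appending deltas. The two styles side by side (a sketch; `chunks` stands in for the OpenAI streaming response):

```python
def stream_deltas(chunks):
    # Old behavior: emit each incremental piece; the consumer must append.
    for chunk in chunks:
        delta = chunk.choices[0].delta.content
        if delta:
            yield delta

def stream_accumulated(chunks):
    # New behavior: emit the full text so far; the consumer just replaces.
    collected = ""
    for chunk in chunks:
        delta = chunk.choices[0].delta.content
        if delta:
            collected += delta
            yield collected
```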
 