Commit eb33652
Parent(s): fecd16e

update: transcript processor

app.py CHANGED
@@ -59,51 +59,199 @@ class TranscriptProcessor:
 
     def _process_transcript(self) -> None:
         results = self.transcript_data["results"]
-...
+        current_words = []
+        current_speaker = None
+        current_start = None
+        current_items = []
+
+        for item in results["items"]:
+            if item["type"] == "pronunciation":
+                speaker = (
+                    item.get("speaker_label", "").replace("spk_", "").replace("spk", "")
+                )
 
-...
+                # Initialize on first pronunciation item
+                if current_speaker is None:
+                    current_speaker = speaker
+                    current_start = float(item["start_time"])
+
+                # Check for speaker change
+                if speaker != current_speaker:
+                    if current_items:
+                        self._create_segment(
+                            current_speaker,
+                            current_start,
+                            float(item["start_time"]),
+                            current_items,
+                        )
+                    current_items = []
+                    current_words = []
+                    current_speaker = speaker
+                    current_start = float(item["start_time"])
+
+                current_items.append(item)
+                current_words.append(item["alternatives"][0]["content"])
+            elif item["type"] == "punctuation":
+                current_items.append(item)
+                # Only check for segment break if we're over 20 words
+                if len(current_words) >= 20:
+                    # Break on this punctuation
+                    next_item = next(
+                        (
+                            it
+                            for it in results["items"][
+                                results["items"].index(item) + 1 :
+                            ]
+                            if it["type"] == "pronunciation"
+                        ),
+                        None,
                     )
-...
+                    if next_item:
+                        self._create_segment(
+                            current_speaker,
+                            current_start,
+                            float(next_item["start_time"]),
+                            current_items,
+                        )
+                        current_items = []
+                        current_words = []
+                        current_start = float(next_item["start_time"])
+
+        # Don't forget the last segment
+        if current_items:
+            last_time = max(
+                float(item["end_time"])
+                for item in current_items
+                if item["type"] == "pronunciation"
+            )
+            self._create_segment(
+                current_speaker, current_start, last_time, current_items
+            )
 
     def _create_segment(
         self, speaker_id: str, start: float, end: float, items: list
     ) -> None:
-...
-        if words:
+        segment_content = []
+        for item in items:
+            if item["type"] == "pronunciation":
+                segment_content.append(item["alternatives"][0]["content"])
+            elif item["type"] == "punctuation":
+                # Append punctuation to the last word without a space
+                if segment_content:
+                    segment_content[-1] += item["alternatives"][0]["content"]
+
+        if segment_content:
             self.segments.append(
                 TranscriptSegment(
                     speaker_id=speaker_id,
                     start_time=start,
                     end_time=end,
-                    text=" ".join(words),
+                    text=" ".join(segment_content),
                 )
             )
 
+    def correct_speaker_mapping_with_agenda(self, url: str) -> None:
+        """Fetch agenda from a URL and correct the speaker mapping using OpenAI."""
+        try:
+            if not url.startswith("http"):
+                # add https to the url
+                url = "https://" + url
+
+            response = requests.get(url)
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse the HTML to find the desired description
+            soup = BeautifulSoup(html_content, "html.parser")
+            description_tag = soup.find(
+                "script", {"type": "application/ld+json"}
+            )  # Find the ld+json metadata block
+            agenda = ""
+
+            if description_tag:
+                # Extract the JSON content
+                json_data = json.loads(description_tag.string)
+                if "description" in json_data:
+                    agenda = json_data["description"]
+                else:
+                    print("Agenda description not found in the JSON metadata.")
+            else:
+                print("No structured data (ld+json) found.")
+
+            if not agenda:
+                print("No agenda found in the structured metadata. Trying meta tags.")
+
+                # Fallback: Use meta description if ld+json doesn't have it
+                meta_description = soup.find("meta", {"name": "description"})
+                agenda = meta_description["content"] if meta_description else ""
+
+            if not agenda:
+                print("No agenda found in any description tags.")
+                return
+
+            print(self.speaker_mapping)
+
+            prompt = (
+                f"Given the original speaker mapping {self.speaker_mapping}, agenda:\n{agenda}, and the transcript: {self.formatted_transcript}\n\n"
+                "Some speaker names in the mapping might have spelling errors or be incomplete. "
+                "Remember that the content in the agenda is accurate and the transcript can have errors, so prioritize the spellings and names in the agenda content. "
+                "If a speaker's name and introduction are similar to the agenda, update the speaker name in the mapping. "
+                "Please correct the names based on the agenda. Return the corrected mapping in JSON format as "
+                "{'spk_0': 'Correct Name', 'spk_1': 'Correct Name', ...}. "
+                "Only update a name if it sounds very similar, there is a good spelling overlap, or the speaker's introduction matches the description of the talk from the agenda. If the name is totally unrelated, keep the original name. "
+                "Always include all the speakers from the original mapping, even if you don't update their names, i.e. if there are 4 speakers in the original mapping, the new mapping must always have 4 speakers; ignore all the other speakers in the agenda. I REPEAT: DO NOT ADD OTHER NEW SPEAKERS TO THE MAPPING."
+            )
+
+            client = OpenAI()
+
+            completion = client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0,
+            )
+
+            response_text = completion.choices[0].message.content.strip()
+            try:
+                corrected_mapping = json.loads(response_text)
+            except Exception:
+                response_text = response_text[
+                    response_text.find("{") : response_text.rfind("}") + 1
+                ]
+                try:
+                    corrected_mapping = json.loads(response_text)
+                except json.JSONDecodeError:
+                    print(
+                        "Error parsing corrected speaker mapping JSON, keeping the original mapping."
+                    )
+                    corrected_mapping = self.speaker_mapping
+            # Update the speaker mapping with corrected names
+            self.speaker_mapping = corrected_mapping
+            print("Corrected Speaker Mapping:", self.speaker_mapping)
+
+            # Update the transcript segments with corrected names
+            for segment in self.segments:
+                spk_id = f"spk_{segment.speaker_id}"
+                segment.speaker_name = self.speaker_mapping.get(spk_id, spk_id)
+
+            # Recreate the formatted transcript with corrected names
+            formatted_segments = []
+            for seg in self.segments:
+                start_time_str = self._format_time(seg.start_time)
+                end_time_str = self._format_time(seg.end_time)
+                formatted_segments.append(
+                    f"time_stamp: {start_time_str}-{end_time_str}\n"
+                    f"{seg.speaker_name}: {seg.text}\n"
+                )
+            self.formatted_transcript = "\n".join(formatted_segments)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching agenda from URL: {str(e)}")
+        except Exception as e:
+            print(f"Error correcting speaker mapping: {str(e)}")
+
     def _create_formatted_transcript(self) -> None:
         """Create formatted transcript with default speaker labels."""
         formatted_segments = []
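The heart of this hunk: `_process_transcript` now walks diarized transcript items (the `speaker_label` / string `start_time` / `alternatives[0]["content"]` shapes it reads suggest Amazon Transcribe output, though that is an inference) and closes a segment on every speaker change, plus on punctuation once 20 words have accumulated. A minimal, runnable sketch of the same segmentation idea; the fixture and variable names are illustrative, not the app's API:

```python
# Toy diarized-transcript fixture in the item shape the hunk consumes.
items = [
    {"type": "pronunciation", "speaker_label": "spk_0", "start_time": "0.0",
     "end_time": "0.4", "alternatives": [{"content": "Hello"}]},
    {"type": "punctuation", "alternatives": [{"content": ","}]},
    {"type": "pronunciation", "speaker_label": "spk_0", "start_time": "0.5",
     "end_time": "0.9", "alternatives": [{"content": "everyone"}]},
    {"type": "punctuation", "alternatives": [{"content": "."}]},
    {"type": "pronunciation", "speaker_label": "spk_1", "start_time": "1.2",
     "end_time": "1.6", "alternatives": [{"content": "Thanks"}]},
]

segments = []  # (speaker, start, end, text)
speaker, start, end, words = None, None, None, []

for item in items:
    if item["type"] == "pronunciation":
        spk = item["speaker_label"].replace("spk_", "")
        if speaker is None:
            speaker, start = spk, float(item["start_time"])
        elif spk != speaker:
            # Speaker change: close the open segment at the new speaker's start.
            segments.append((speaker, start, float(item["start_time"]), " ".join(words)))
            speaker, start, words = spk, float(item["start_time"]), []
        words.append(item["alternatives"][0]["content"])
        end = float(item["end_time"])
    elif item["type"] == "punctuation" and words:
        words[-1] += item["alternatives"][0]["content"]  # attach to previous word

if words:
    segments.append((speaker, start, end, " ".join(words)))  # flush the tail

print(segments)
# [('0', 0.0, 1.2, 'Hello, everyone.'), ('1', 1.2, 1.6, 'Thanks')]
```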
@@ -314,13 +462,30 @@ Total takes: 2
         system_prompt = f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, and Call Type: {ct}.
 
 CORE REQUIREMENTS:
-1. ...
-...
+1. SPEAKER ISOLATION: When creating a clip, you must:
+   - Include ONLY continuous dialogue from ONE speaker
+   - The speaker must talk continuously without any interruptions
+   - As soon as another speaker starts talking, the clip MUST end
+   - Never combine dialogue across interruptions, even from the same speaker
+
+2. DURATION RULES:
+   - Each clip must be 20-100 seconds of CONTINUOUS speech
+   - If a speaker's dialogue is interrupted before 20 seconds, it cannot be used
+   - Clock starts when speaker begins and must end before any other speaker starts
+
+3. TOPIC COHERENCE:
+   - Each clip must cover one complete topic/thought
+   - Must end before speaker changes topics
+   - Content should be engaging and viral-worthy
+
+4. SPEAKER COVERAGE:
+   - Minimum 2 topics per speaker, aim for 3 if good content exists
+   - Host can be skipped if no compelling content
+
+CRITICAL: When analyzing timestamps, you must verify that:
+1. No other speaker talks during the selected timeframe
+2. The speaker talks continuously for at least 20 seconds
+3. The clip ends BEFORE any interruption or speaker change
 """
 
         user_prompt = f"""Call Details:
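These constraints are enforced purely by instruction; nothing in the commit validates the model's output against them. For comparison, a hedged sketch of what a programmatic check of the two hard rules (20-100 s duration, single uninterrupted speaker) could look like; `clip_is_valid` is a hypothetical helper, not part of the app:

```python
def clip_is_valid(segments, speaker, st, et):
    """segments: (speaker_id, start_sec, end_sec) tuples; st/et: proposed clip bounds."""
    if not 20 <= et - st <= 100:
        return False  # duration rule: 20-100 seconds
    for spk, seg_start, seg_end in segments:
        if seg_start < et and seg_end > st and spk != speaker:
            return False  # another speaker talks inside the clip
    return True

segs = [("0", 0, 30), ("1", 30, 35), ("0", 35, 90)]
print(clip_is_valid(segs, "0", 0, 30))  # True: 30 s of one speaker
print(clip_is_valid(segs, "0", 0, 60))  # False: speaker 1 interrupts at 30 s
```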
@@ -328,40 +493,34 @@ User ID: {uid}
 Call ID: {cid}
 Speakers: {", ".join(speaker_mapping.values())}
 Transcript: {transcript}
-Your task is to analyze speakers' discussions to identify compelling social media clips. For each speaker, identify key topics that mention people, news, events, trends, or sources.
 
-...
+Your task is to create social media clips following these strict rules:
+
+1. TIMESTAMP SELECTION:
+   - You must check the transcript line by line
+   - Verify speaker continuity with NO interruptions
+   - End clips immediately before any other speaker starts
+   - If Speaker A talks from 1:00-1:10, then Speaker B talks, then Speaker A resumes at 1:15, these must be separate clips
+   - Never combine timestamps across interruptions
 
-...
+2. CLIP REQUIREMENTS:
+   - Minimum 20 seconds of CONTINUOUS speech
+   - Maximum 100 seconds
+   - Single speaker only
+   - Must end before any interruption
+   - Complete thoughts/topics only
+
+
+Return Format requirements:
+SPEAKER FORMAT:
 **Speaker Name**
 1. [Topic title <div id='topic' style="display: inline"> 22s at 12:30 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{750}}&et={{772}}&uid={{uid}})
 2. [Topic title <div id='topic' style="display: inline"> 43s at 14:45 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{885}}&et={{928}}&uid={{uid}})
 3. [Topic title <div id='topic' style="display: inline"> 58s at 16:20 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{980}}&et={{1038}}&uid={{uid}})
-...
-...
-   - Start time (st): Must begin exactly when speaker starts discussing the specific topic.
-   - End time (et): Must end exactly when either:
-     * The speaker completes their point, or
-     * Before the next speaker begins.
-   - NO OVERLAP: Selected duration must NEVER include dialogue from other speakers.
-   - Duration limits: Minimum 20 seconds, maximum 1 minute 30 seconds.
-   - Time format: "Xs at HH:MM" where X = seconds.
-   - URL parameters: Convert display times to seconds.
-     Example: "25s at 10:13" → st=613&et=638.
-...
-3. FORMATTING RULES:
-   - Speaker names: Use markdown bold (**Name**).
-   - Topic titles: First word capitalized, rest lowercase.
-   - Each topic must be a clickable link with correct timestamp.
-   - URL format: {{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{start_time_in_sec}}&et={{end_time_in_sec}}&uid={{uid}}
-...
-4. TOPIC SELECTION:
-   - Prioritize engaging, viral-worthy content.
-   - Minimum 2 topics per speaker, aim for 3 if available (SKIP THE HOST if no compelling content).
-   - Topics should be self-contained discussions within the timestamp.
-   - Skip speakers if fewer than 2 compelling topics found.
+**Speaker Name**
+....
 """
-        print(user_prompt)
+        print(user_prompt, speaker_mapping)
 
         completion = client.chat.completions.create(
             model="gpt-4o",
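One nit the removed rules carried: they call the stamp "Xs at HH:MM", but the worked example ("25s at 10:13" → st=613&et=638) treats it as minutes:seconds. A sketch of that conversion; `parse_display_time` is a hypothetical name, not a function in the app:

```python
import re

def parse_display_time(label: str) -> tuple[int, int]:
    """'25s at 10:13' -> (613, 638): st = 10*60 + 13, et = st + 25."""
    duration, minutes, seconds = map(
        int, re.match(r"(\d+)s at (\d+):(\d+)", label).groups()
    )
    st = minutes * 60 + seconds
    return st, st + duration

print(parse_display_time("25s at 10:13"))  # (613, 638)
```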
@@ -479,7 +638,6 @@ If the user provides a link to the agenda, use the correct_speaker_name_with_url
 If the user provides the correct call type, use the correct_call_type function to correct the call type. Call Type for street interviews is 'si'.
 """
     messages = [{"role": "system", "content": prompt}]
-    print(messages[0]["content"])
 
     for user_msg, assistant_msg in chat_history:
         if user_msg is not None:  # Skip the initial message where user_msg is None
@@ -522,10 +680,14 @@ If the user provides the correct call type, use the correct_call_type function t
                         corrected_speaker_mapping = (
                             transcript_processor.speaker_mapping
                         )
+                        messages.append(response.choices[0].message)
+
                         function_call_result_message = {
                             "role": "tool",
                             "content": json.dumps(
-                                {...}
+                                {
+                                    "speaker_mapping": f"Corrected Speaker Mapping... {corrected_speaker_mapping}"
+                                }
                             ),
                             "name": tool_call.function.name,
                             "tool_call_id": tool_call.id,
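The added `messages.append(response.choices[0].message)` is the substantive fix: the Chat Completions API rejects a `role="tool"` message unless it follows the assistant message that carried the matching `tool_calls`. A minimal sketch of the required ordering with the OpenAI Python SDK (>= 1.0); the tool schema and prompts are illustrative stand-ins, and an `OPENAI_API_KEY` is assumed in the environment:

```python
import json
from openai import OpenAI

client = OpenAI()
tools = [{
    "type": "function",
    "function": {
        "name": "correct_call_type",
        "parameters": {"type": "object",
                       "properties": {"call_type": {"type": "string"}}},
    },
}]
messages = [{"role": "user", "content": "The call type is street interview."}]

response = client.chat.completions.create(model="gpt-4o", messages=messages, tools=tools)
assistant_msg = response.choices[0].message  # carries .tool_calls when a tool is chosen

messages.append(assistant_msg)  # 1) assistant turn with tool_calls (the fix above)
for tool_call in assistant_msg.tool_calls or []:
    messages.append({  # 2) one tool result per call, matched by tool_call_id
        "role": "tool",
        "name": tool_call.function.name,
        "tool_call_id": tool_call.id,
        "content": json.dumps({"call_type": "si"}),
    })

final = client.chat.completions.create(model="gpt-4o", messages=messages)
```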
@@ -534,13 +696,18 @@ If the user provides the correct call type, use the correct_call_type function t
 
                         # Get final response after tool call
                         final_response = client.chat.completions.create(
-                            model="gpt-4o", ...
+                            model="gpt-4o",
+                            messages=messages,
+                            stream=True,
                         )
 
-...
+                        collected_chunk = ""
                         for final_chunk in final_response:
                             if final_chunk.choices[0].delta.content:
-...
+                                collected_chunk += final_chunk.choices[
+                                    0
+                                ].delta.content
+                                yield collected_chunk
                         return
                     else:
                         function_call_result_message = {
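The new streaming loop accumulates deltas and yields the running text, the pattern Gradio-style chat callbacks expect: each yield replaces the displayed message rather than appending to it. A runnable sketch with a fake token stream standing in for the OpenAI response:

```python
def fake_stream():
    # Stand-in for the streamed chat completion chunks.
    for token in ["Corrected ", "speaker ", "mapping ", "applied."]:
        yield token

def respond():
    collected_chunk = ""
    for delta in fake_stream():
        collected_chunk += delta
        yield collected_chunk  # yield the full text so far, not just the delta

for partial in respond():
    print(partial)
# Corrected
# Corrected speaker
# Corrected speaker mapping
# Corrected speaker mapping applied.
```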
|