Commit eb33652
Parent(s): fecd16e

update: transcript processor

app.py CHANGED
@@ -59,51 +59,199 @@ class TranscriptProcessor:
 
     def _process_transcript(self) -> None:
         results = self.transcript_data["results"]
-...
+        current_words = []
+        current_speaker = None
+        current_start = None
+        current_items = []
+
+        for item in results["items"]:
+            if item["type"] == "pronunciation":
+                speaker = (
+                    item.get("speaker_label", "").replace("spk_", "").replace("spk", "")
+                )
 
-...
+                # Initialize on first pronunciation item
+                if current_speaker is None:
+                    current_speaker = speaker
+                    current_start = float(item["start_time"])
+
+                # Check for speaker change
+                if speaker != current_speaker:
+                    if current_items:
+                        self._create_segment(
+                            current_speaker,
+                            current_start,
+                            float(item["start_time"]),
+                            current_items,
+                        )
+                    current_items = []
+                    current_words = []
+                    current_speaker = speaker
+                    current_start = float(item["start_time"])
+
+                current_items.append(item)
+                current_words.append(item["alternatives"][0]["content"])
+            elif item["type"] == "punctuation":
+                current_items.append(item)
+                # Only check for segment break if we're over 20 words
+                if len(current_words) >= 20:
+                    # Break on this punctuation
+                    next_item = next(
+                        (
+                            it
+                            for it in results["items"][
+                                results["items"].index(item) + 1 :
+                            ]
+                            if it["type"] == "pronunciation"
+                        ),
+                        None,
                     )
-...
+                    if next_item:
+                        self._create_segment(
+                            current_speaker,
+                            current_start,
+                            float(next_item["start_time"]),
+                            current_items,
+                        )
+                        current_items = []
+                        current_words = []
+                        current_start = float(next_item["start_time"])
+
+        # Don't forget the last segment
+        if current_items:
+            last_time = max(
+                float(item["end_time"])
+                for item in current_items
+                if item["type"] == "pronunciation"
+            )
+            self._create_segment(
+                current_speaker, current_start, last_time, current_items
+            )
 
     def _create_segment(
         self, speaker_id: str, start: float, end: float, items: list
     ) -> None:
-...
-        if words:
+        segment_content = []
+        for item in items:
+            if item["type"] == "pronunciation":
+                segment_content.append(item["alternatives"][0]["content"])
+            elif item["type"] == "punctuation":
+                # Append punctuation to the last word without a space
+                if segment_content:
+                    segment_content[-1] += item["alternatives"][0]["content"]
+
+        if segment_content:
             self.segments.append(
                 TranscriptSegment(
                     speaker_id=speaker_id,
                     start_time=start,
                     end_time=end,
-                    text=" ".join(words),
+                    text=" ".join(segment_content),
                 )
             )
 
+    def correct_speaker_mapping_with_agenda(self, url: str) -> None:
+        """Fetch agenda from a URL and correct the speaker mapping using OpenAI."""
+        try:
+            if not url.startswith("http"):
+                # add https to the url
+                url = "https://" + url
+
+            response = requests.get(url)
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse the HTML to find the desired description
+            soup = BeautifulSoup(html_content, "html.parser")
+            description_tag = soup.find(
+                "script", {"type": "application/ld+json"}
+            )  # Find the ld+json metadata block
+            agenda = ""
+
+            if description_tag:
+                # Extract the JSON content
+                json_data = json.loads(description_tag.string)
+                if "description" in json_data:
+                    agenda = json_data["description"]
+                else:
+                    print("Agenda description not found in the JSON metadata.")
+            else:
+                print("No structured data (ld+json) found.")
+
+            if not agenda:
+                print("No agenda found in the structured metadata. Trying meta tags.")
+
+                # Fallback: Use meta description if ld+json doesn't have it
+                meta_description = soup.find("meta", {"name": "description"})
+                agenda = meta_description["content"] if meta_description else ""
+
+            if not agenda:
+                print("No agenda found in any description tags.")
+                return
+
+            print(self.speaker_mapping)
+
+            prompt = (
+                f"Given the original speaker mapping {self.speaker_mapping}, agenda:\n{agenda}, and the transcript: {self.formatted_transcript}\n\n"
+                "Some speaker names in the mapping might have spelling errors or be incomplete. "
+                "Remember that the content in the agenda is accurate and the transcript can have errors, so prioritize the spellings and names in the agenda content. "
+                "If a speaker's name and introduction are similar to the agenda, update the speaker name in the mapping. "
+                "Please correct the names based on the agenda. Return the corrected mapping in JSON format as "
+                "{'spk_0': 'Correct Name', 'spk_1': 'Correct Name', ...}. "
+                "Only update a name if it sounds very similar, there is a good spelling overlap, or the speaker's introduction matches the description of the talk from the agenda. If the name is totally unrelated, keep the original name. "
+                "Always include all the speakers from the original mapping, even if you don't update their names, i.e. if there are 4 speakers in the original mapping, the new mapping must always have 4 speakers; ignore all the other speakers in the agenda. I REPEAT: DO NOT ADD OTHER NEW SPEAKERS TO THE MAPPING."
+            )
+
+            client = OpenAI()
+
+            completion = client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0,
+            )
+
+            response_text = completion.choices[0].message.content.strip()
+            try:
+                corrected_mapping = json.loads(response_text)
+            except Exception:
+                response_text = response_text[
+                    response_text.find("{") : response_text.rfind("}") + 1
+                ]
+                try:
+                    corrected_mapping = json.loads(response_text)
+                except json.JSONDecodeError:
+                    print(
+                        "Error parsing corrected speaker mapping JSON, keeping the original mapping."
+                    )
+                    corrected_mapping = self.speaker_mapping
+            # Update the speaker mapping with corrected names
+            self.speaker_mapping = corrected_mapping
+            print("Corrected Speaker Mapping:", self.speaker_mapping)
+
+            # Update the transcript segments with corrected names
+            for segment in self.segments:
+                spk_id = f"spk_{segment.speaker_id}"
+                segment.speaker_name = self.speaker_mapping.get(spk_id, spk_id)
+
+            # Recreate the formatted transcript with corrected names
+            formatted_segments = []
+            for seg in self.segments:
+                start_time_str = self._format_time(seg.start_time)
+                end_time_str = self._format_time(seg.end_time)
+                formatted_segments.append(
+                    f"time_stamp: {start_time_str}-{end_time_str}\n"
+                    f"{seg.speaker_name}: {seg.text}\n"
+                )
+            self.formatted_transcript = "\n".join(formatted_segments)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching agenda from URL: {str(e)}")
+        except Exception as e:
+            print(f"Error correcting speaker mapping: {str(e)}")
+
     def _create_formatted_transcript(self) -> None:
         """Create formatted transcript with default speaker labels."""
         formatted_segments = []
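The heart of this hunk: `_process_transcript` now walks diarized transcript items (the `speaker_label` / string `start_time` / `alternatives[0]["content"]` shapes it reads suggest Amazon Transcribe output, though that is an inference) and closes a segment on every speaker change, plus on punctuation once 20 words have accumulated. A minimal, runnable sketch of the same segmentation idea; the fixture and variable names are illustrative, not the app's API:

```python
# Toy diarized-transcript fixture in the item shape the hunk consumes.
items = [
    {"type": "pronunciation", "speaker_label": "spk_0", "start_time": "0.0",
     "end_time": "0.4", "alternatives": [{"content": "Hello"}]},
    {"type": "punctuation", "alternatives": [{"content": ","}]},
    {"type": "pronunciation", "speaker_label": "spk_0", "start_time": "0.5",
     "end_time": "0.9", "alternatives": [{"content": "everyone"}]},
    {"type": "punctuation", "alternatives": [{"content": "."}]},
    {"type": "pronunciation", "speaker_label": "spk_1", "start_time": "1.2",
     "end_time": "1.6", "alternatives": [{"content": "Thanks"}]},
]

segments = []  # (speaker, start, end, text)
speaker, start, end, words = None, None, None, []

for item in items:
    if item["type"] == "pronunciation":
        spk = item["speaker_label"].replace("spk_", "")
        if speaker is None:
            speaker, start = spk, float(item["start_time"])
        elif spk != speaker:
            # Speaker change: close the open segment at the new speaker's start.
            segments.append((speaker, start, float(item["start_time"]), " ".join(words)))
            speaker, start, words = spk, float(item["start_time"]), []
        words.append(item["alternatives"][0]["content"])
        end = float(item["end_time"])
    elif item["type"] == "punctuation" and words:
        words[-1] += item["alternatives"][0]["content"]  # attach to previous word

if words:
    segments.append((speaker, start, end, " ".join(words)))  # flush the tail

print(segments)
# [('0', 0.0, 1.2, 'Hello, everyone.'), ('1', 1.2, 1.6, 'Thanks')]
```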
@@ -314,13 +462,30 @@ Total takes: 2
         system_prompt = f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, and Call Type: {ct}.
 
 CORE REQUIREMENTS:
-1. ...
-...
+1. SPEAKER ISOLATION: When creating a clip, you must:
+   - Include ONLY continuous dialogue from ONE speaker
+   - The speaker must talk continuously without any interruptions
+   - As soon as another speaker starts talking, the clip MUST end
+   - Never combine dialogue across interruptions, even from the same speaker
+
+2. DURATION RULES:
+   - Each clip must be 20-100 seconds of CONTINUOUS speech
+   - If a speaker's dialogue is interrupted before 20 seconds, it cannot be used
+   - Clock starts when speaker begins and must end before any other speaker starts
+
+3. TOPIC COHERENCE:
+   - Each clip must cover one complete topic/thought
+   - Must end before speaker changes topics
+   - Content should be engaging and viral-worthy
+
+4. SPEAKER COVERAGE:
+   - Minimum 2 topics per speaker, aim for 3 if good content exists
+   - Host can be skipped if no compelling content
+
+CRITICAL: When analyzing timestamps, you must verify that:
+1. No other speaker talks during the selected timeframe
+2. The speaker talks continuously for at least 20 seconds
+3. The clip ends BEFORE any interruption or speaker change
 """
 
         user_prompt = f"""Call Details:
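These constraints are enforced purely by instruction; nothing in the commit validates the model's output against them. For comparison, a hedged sketch of what a programmatic check of the two hard rules (20-100 s duration, single uninterrupted speaker) could look like; `clip_is_valid` is a hypothetical helper, not part of the app:

```python
def clip_is_valid(segments, speaker, st, et):
    """segments: (speaker_id, start_sec, end_sec) tuples; st/et: proposed clip bounds."""
    if not 20 <= et - st <= 100:
        return False  # duration rule: 20-100 seconds
    for spk, seg_start, seg_end in segments:
        if seg_start < et and seg_end > st and spk != speaker:
            return False  # another speaker talks inside the clip
    return True

segs = [("0", 0, 30), ("1", 30, 35), ("0", 35, 90)]
print(clip_is_valid(segs, "0", 0, 30))  # True: 30 s of one speaker
print(clip_is_valid(segs, "0", 0, 60))  # False: speaker 1 interrupts at 30 s
```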
@@ -328,40 +493,34 @@ User ID: {uid}
 Call ID: {cid}
 Speakers: {", ".join(speaker_mapping.values())}
 Transcript: {transcript}
-Your task is to analyze speakers' discussions to identify compelling social media clips. For each speaker, identify key topics that mention people, news, events, trends, or sources.
 
-...
+Your task is to create social media clips following these strict rules:
+
+1. TIMESTAMP SELECTION:
+   - You must check the transcript line by line
+   - Verify speaker continuity with NO interruptions
+   - End clips immediately before any other speaker starts
+   - If Speaker A talks from 1:00-1:10, then Speaker B talks, then Speaker A resumes at 1:15, these must be separate clips
+   - Never combine timestamps across interruptions
 
-...
+2. CLIP REQUIREMENTS:
+   - Minimum 20 seconds of CONTINUOUS speech
+   - Maximum 100 seconds
+   - Single speaker only
+   - Must end before any interruption
+   - Complete thoughts/topics only
+
+
+Return Format requirements:
+SPEAKER FORMAT:
 **Speaker Name**
 1. [Topic title <div id='topic' style="display: inline"> 22s at 12:30 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{750}}&et={{772}}&uid={{uid}})
 2. [Topic title <div id='topic' style="display: inline"> 43s at 14:45 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{885}}&et={{928}}&uid={{uid}})
 3. [Topic title <div id='topic' style="display: inline"> 58s at 16:20 </div>]({{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{980}}&et={{1038}}&uid={{uid}})
-...
-...
-   - Start time (st): Must begin exactly when speaker starts discussing the specific topic.
-   - End time (et): Must end exactly when either:
-     * The speaker completes their point, or
-     * Before the next speaker begins.
-   - NO OVERLAP: Selected duration must NEVER include dialogue from other speakers.
-   - Duration limits: Minimum 20 seconds, maximum 1 minute 30 seconds.
-   - Time format: "Xs at HH:MM" where X = seconds.
-   - URL parameters: Convert display times to seconds.
-     Example: "25s at 10:13" → st=613&et=638.
-...
-3. FORMATTING RULES:
-   - Speaker names: Use markdown bold (**Name**).
-   - Topic titles: First word capitalized, rest lowercase.
-   - Each topic must be a clickable link with correct timestamp.
-   - URL format: {{link_start}}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{start_time_in_sec}}&et={{end_time_in_sec}}&uid={{uid}}
-...
-4. TOPIC SELECTION:
-   - Prioritize engaging, viral-worthy content.
-   - Minimum 2 topics per speaker, aim for 3 if available (SKIP THE HOST if no compelling content).
-   - Topics should be self-contained discussions within the timestamp.
-   - Skip speakers if fewer than 2 compelling topics found.
+**Speaker Name**
+....
 """
-        print(user_prompt)
+        print(user_prompt, speaker_mapping)
 
         completion = client.chat.completions.create(
             model="gpt-4o",
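One nit the removed rules carried: they call the stamp "Xs at HH:MM", but the worked example ("25s at 10:13" → st=613&et=638) treats it as minutes:seconds. A sketch of that conversion; `parse_display_time` is a hypothetical name, not a function in the app:

```python
import re

def parse_display_time(label: str) -> tuple[int, int]:
    """'25s at 10:13' -> (613, 638): st = 10*60 + 13, et = st + 25."""
    duration, minutes, seconds = map(
        int, re.match(r"(\d+)s at (\d+):(\d+)", label).groups()
    )
    st = minutes * 60 + seconds
    return st, st + duration

print(parse_display_time("25s at 10:13"))  # (613, 638)
```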
@@ -479,7 +638,6 @@ If the user provides a link to the agenda, use the correct_speaker_name_with_url
 If the user provides the correct call type, use the correct_call_type function to correct the call type. Call Type for street interviews is 'si'.
 """
     messages = [{"role": "system", "content": prompt}]
-    print(messages[0]["content"])
 
     for user_msg, assistant_msg in chat_history:
         if user_msg is not None:  # Skip the initial message where user_msg is None
@@ -522,10 +680,14 @@ If the user provides the correct call type, use the correct_call_type function t
                         corrected_speaker_mapping = (
                             transcript_processor.speaker_mapping
                         )
+                        messages.append(response.choices[0].message)
+
                         function_call_result_message = {
                             "role": "tool",
                             "content": json.dumps(
-                                {...}
+                                {
+                                    "speaker_mapping": f"Corrected Speaker Mapping... {corrected_speaker_mapping}"
+                                }
                             ),
                             "name": tool_call.function.name,
                             "tool_call_id": tool_call.id,
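The added `messages.append(response.choices[0].message)` is the substantive fix: the Chat Completions API rejects a `role="tool"` message unless it follows the assistant message that carried the matching `tool_calls`. A minimal sketch of the required ordering with the OpenAI Python SDK (>= 1.0); the tool schema and prompts are illustrative stand-ins, and an `OPENAI_API_KEY` is assumed in the environment:

```python
import json
from openai import OpenAI

client = OpenAI()
tools = [{
    "type": "function",
    "function": {
        "name": "correct_call_type",
        "parameters": {"type": "object",
                       "properties": {"call_type": {"type": "string"}}},
    },
}]
messages = [{"role": "user", "content": "The call type is street interview."}]

response = client.chat.completions.create(model="gpt-4o", messages=messages, tools=tools)
assistant_msg = response.choices[0].message  # carries .tool_calls when a tool is chosen

messages.append(assistant_msg)  # 1) assistant turn with tool_calls (the fix above)
for tool_call in assistant_msg.tool_calls or []:
    messages.append({  # 2) one tool result per call, matched by tool_call_id
        "role": "tool",
        "name": tool_call.function.name,
        "tool_call_id": tool_call.id,
        "content": json.dumps({"call_type": "si"}),
    })

final = client.chat.completions.create(model="gpt-4o", messages=messages)
```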
@@ -534,13 +696,18 @@ If the user provides the correct call type, use the correct_call_type function t
 
                         # Get final response after tool call
                         final_response = client.chat.completions.create(
-                            model="gpt-4o", ...
+                            model="gpt-4o",
+                            messages=messages,
+                            stream=True,
                         )
 
-...
+                        collected_chunk = ""
                         for final_chunk in final_response:
                             if final_chunk.choices[0].delta.content:
-...
+                                collected_chunk += final_chunk.choices[
+                                    0
+                                ].delta.content
+                                yield collected_chunk
                         return
                     else:
                         function_call_result_message = {
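The new streaming loop accumulates deltas and yields the running text, the pattern Gradio-style chat callbacks expect: each yield replaces the displayed message rather than appending to it. A runnable sketch with a fake token stream standing in for the OpenAI response:

```python
def fake_stream():
    # Stand-in for the streamed chat completion chunks.
    for token in ["Corrected ", "speaker ", "mapping ", "applied."]:
        yield token

def respond():
    collected_chunk = ""
    for delta in fake_stream():
        collected_chunk += delta
        yield collected_chunk  # yield the full text so far, not just the delta

for partial in respond():
    print(partial)
# Corrected
# Corrected speaker
# Corrected speaker mapping
# Corrected speaker mapping applied.
```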
|