AhmadMustafa committed on
Commit
8498900
·
1 Parent(s): 37287c3

update: SI prompt and Transcript Processor

Browse files
Files changed (1) hide show
  1. app.py +103 -174
app.py CHANGED
@@ -19,14 +19,19 @@ class TranscriptSegment:
19
 
20
 
21
  class TranscriptProcessor:
22
- def __init__(self, transcript_file: str = None, transcript_data: dict = None):
 
 
 
 
 
 
23
  self.transcript_file = transcript_file
24
  self.transcript_data = transcript_data
25
  self.formatted_transcript = None
26
  self.segments = []
27
- self.text_windows = []
28
- self.window_size = 2
29
  self.speaker_mapping = {}
 
30
  if self.transcript_file:
31
  self._load_transcript()
32
  elif self.transcript_data:
@@ -37,7 +42,9 @@ class TranscriptProcessor:
37
  )
38
 
39
  self._process_transcript()
40
- self.map_speaker_ids_to_names()
 
 
41
 
42
  def _load_transcript(self) -> None:
43
  """Load the transcript JSON file."""
@@ -51,10 +58,7 @@ class TranscriptProcessor:
51
  return f"{minutes:02d}:{seconds:02d}"
52
 
53
  def _process_transcript(self) -> None:
54
- """Process the transcript into segments with speaker information and create a formatted version with timestamps."""
55
  results = self.transcript_data["results"]
56
-
57
- # Process into segments
58
  for segment in results["speaker_labels"]["segments"]:
59
  speaker_id = segment.get("speaker_label", segment.get("speakerlabel", ""))
60
  speaker_id = (
@@ -64,55 +68,59 @@ class TranscriptProcessor:
64
  start_time = float(segment.get("start_time", 0))
65
  end_time = float(segment.get("end_time", 0))
66
 
67
- items = [
68
- item
69
- for item in results["items"]
70
- if "start_time" in item
71
- and float(item["start_time"]) >= start_time
72
- and float(item["start_time"]) < end_time
73
- and item["type"] == "pronunciation"
74
- ]
75
-
76
- words = [item["alternatives"][0]["content"] for item in items]
77
- if words:
78
- self.segments.append(
79
- TranscriptSegment(
80
- speaker_id=speaker_id,
81
- start_time=start_time,
82
- end_time=end_time,
83
- text=" ".join(words),
84
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  )
 
86
 
 
 
87
  formatted_segments = []
88
  for seg in self.segments:
89
  start_time_str = self._format_time(seg.start_time)
90
  end_time_str = self._format_time(seg.end_time)
 
 
91
  formatted_segments.append(
92
  f"time_stamp: {start_time_str}-{end_time_str}\n"
93
- f"spk {seg.speaker_id}: {seg.text}\n"
94
  )
95
-
96
  self.formatted_transcript = "\n".join(formatted_segments)
97
 
98
- # Create sliding windows of text for better matching
99
- for i in range(len(self.segments)):
100
- # Combine current segment with next segments within window
101
- window_segments = self.segments[i : i + self.window_size]
102
- combined_text = " ".join(seg.text for seg in window_segments)
103
- if window_segments:
104
- self.text_windows.append(
105
- {
106
- "text": combined_text,
107
- "start_time": window_segments[0].start_time,
108
- "end_time": window_segments[-1].end_time,
109
- }
110
- )
111
-
112
  def map_speaker_ids_to_names(self) -> None:
113
  """Map speaker IDs to names based on introductions in the transcript."""
114
  try:
115
-
116
  transcript = self.formatted_transcript
117
 
118
  prompt = (
@@ -135,7 +143,6 @@ class TranscriptProcessor:
135
  try:
136
  self.speaker_mapping = json.loads(response_text)
137
  except json.JSONDecodeError:
138
- # extract left most and right most {}
139
  response_text = response_text[
140
  response_text.find("{") : response_text.rfind("}") + 1
141
  ]
@@ -144,124 +151,31 @@ class TranscriptProcessor:
144
  except json.JSONDecodeError:
145
  print("Error parsing speaker mapping JSON.")
146
  self.speaker_mapping = {}
 
 
147
  for segment in self.segments:
148
  spk_id = f"spk_{segment.speaker_id}"
149
  speaker_name = self.speaker_mapping.get(spk_id, spk_id)
150
  segment.speaker_name = speaker_name
151
 
152
- # Recreate the formatted transcript with speaker names
153
- formatted_segments = []
154
- for seg in self.segments:
155
- start_time_str = self._format_time(seg.start_time)
156
- end_time_str = self._format_time(seg.end_time)
157
- formatted_segments.append(
158
- f"time_stamp: {start_time_str}-{end_time_str}\n"
159
- f"{seg.speaker_name}: {seg.text}\n"
160
- )
161
- self.formatted_transcript = "\n".join(formatted_segments)
162
 
163
  except Exception as e:
164
  print(f"Error mapping speaker IDs to names: {str(e)}")
165
  self.speaker_mapping = {}
166
 
167
- def correct_speaker_mapping_with_agenda(self, url: str) -> None:
168
- """Fetch agenda from a URL and correct the speaker mapping using OpenAI."""
169
- try:
170
-
171
- response = requests.get(url)
172
- response.raise_for_status()
173
- html_content = response.text
174
-
175
- # Parse the HTML to find the desired description
176
- soup = BeautifulSoup(html_content, "html.parser")
177
- description_tag = soup.find(
178
- "script", {"type": "application/ld+json"}
179
- ) # Find the ld+json metadata block
180
- agenda = ""
181
-
182
- if description_tag:
183
- # Extract the JSON content
184
- json_data = json.loads(description_tag.string)
185
- if "description" in json_data:
186
- agenda = json_data["description"]
187
- else:
188
- print("Agenda description not found in the JSON metadata.")
189
- else:
190
- print("No structured data (ld+json) found.")
191
-
192
- if not agenda:
193
- print("No agenda found in the structured metadata. Trying meta tags.")
194
-
195
- # Fallback: Use meta description if ld+json doesn't have it
196
- meta_description = soup.find("meta", {"name": "description"})
197
- agenda = meta_description["content"] if meta_description else ""
198
-
199
- if not agenda:
200
- print("No agenda found in any description tags.")
201
- return
202
-
203
- print(self.speaker_mapping)
204
-
205
- prompt = (
206
- f"Given the original speaker mapping {self.speaker_mapping}, agenda:\n{agenda}, and the transcript: {self.formatted_transcript}\n\n"
207
- "Some speaker names in the mapping might have spelling errors or be incomplete."
208
- "Remember that the content in agenda is accurate and transcript can have errors so prioritize the spellings and names in the agenda content."
209
- "If the speaker name and introduction is similar to the agenda, update the speaker name in the mapping."
210
- "Please correct the names based on the agenda. Return the corrected mapping in JSON format as "
211
- "{'spk_0': 'Correct Name', 'spk_1': 'Correct Name', ...}."
212
- "You should only update the name if the name sounds very similar, or there is a good spelling overlap/ The Speaker Introduction matches the description of the Talk from Agends. If the name is totally unrelated, keep the original name."
213
- "You should always include all the speakers in the mapping from the original mapping, even if you don't update their names. i.e if there are 4 speakers in original mapping, new mapping should have 4 speakers always, ignore all the other spekaers in the agenda. I REPEAT DO NOT ADD OTHER NEW SPEAKERS IN THE MAPPING."
214
- )
215
-
216
- client = OpenAI()
217
-
218
- completion = client.chat.completions.create(
219
- model="gpt-4o-mini",
220
- messages=[
221
- {"role": "system", "content": "You are a helpful assistant."},
222
- {"role": "user", "content": prompt},
223
- ],
224
- temperature=0,
225
  )
226
-
227
- response_text = completion.choices[0].message.content.strip()
228
- try:
229
- corrected_mapping = json.loads(response_text)
230
- except Exception:
231
- response_text = response_text[
232
- response_text.find("{") : response_text.rfind("}") + 1
233
- ]
234
- try:
235
- corrected_mapping = json.loads(response_text)
236
- except json.JSONDecodeError:
237
- print(
238
- "Error parsing corrected speaker mapping JSON, keeping the original mapping."
239
- )
240
- corrected_mapping = self.speaker_mapping
241
- # Update the speaker mapping with corrected names
242
- self.speaker_mapping = corrected_mapping
243
- print("Corrected Speaker Mapping:", self.speaker_mapping)
244
-
245
- # Update the transcript segments with corrected names
246
- for segment in self.segments:
247
- spk_id = f"spk_{segment.speaker_id}"
248
- segment.speaker_name = self.speaker_mapping.get(spk_id, spk_id)
249
-
250
- # Recreate the formatted transcript with corrected names
251
- formatted_segments = []
252
- for seg in self.segments:
253
- start_time_str = self._format_time(seg.start_time)
254
- end_time_str = self._format_time(seg.end_time)
255
- formatted_segments.append(
256
- f"time_stamp: {start_time_str}-{end_time_str}\n"
257
- f"{seg.speaker_name}: {seg.text}\n"
258
- )
259
- self.formatted_transcript = "\n".join(formatted_segments)
260
-
261
- except requests.exceptions.RequestException as e:
262
- print(f"Error fetching agenda from URL: {str(e)}")
263
- except Exception as e:
264
- print(f"Error correcting speaker mapping: {str(e)}")
265
 
266
  def get_transcript(self) -> str:
267
  """Return the formatted transcript with speaker names."""
@@ -333,7 +247,6 @@ def get_initial_analysis(
333
  link_start = "http"
334
  else:
335
  link_start = "https"
336
- print(uid)
337
  if ct == "si": # street interview
338
  prompt = f"""This is a transcript for a street interview. Call Details are as follows:
339
  User ID UID: {uid}
@@ -381,6 +294,22 @@ Total takes: 2
381
  - [Take 1. <div id='topic' style="display: inline"> 10s at 09:45]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{585}}&et={{595}}&uid={{uid}})
382
  - [Take 1. <div id='topic' style="display: inline"> 20s at 25:45]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{1245}}&et={{1265}}&uid={{uid}}))
383
  - [Take 3 (Best). <div id='topic' style="display: inline"> 5s at 10:13 </div>]({link_start}://roll.ai/colab/1234aq_12314/51234151?st=613&et=618&uid=82314)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  else:
385
  prompt = f"""Call Details:
386
  User ID: {uid}
@@ -420,28 +349,27 @@ Format requirements:
420
  - Topics should be self-contained discussions within the timestamp
421
  - Skip speakers if fewer than 2 compelling topics found
422
  """
423
- print(prompt)
424
- completion = client.chat.completions.create(
425
- model="gpt-4o-mini",
426
- messages=[
427
- {
428
- "role": "system",
429
- "content": f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, Call Type: {ct}.
430
-
431
- CORE REQUIREMENTS:
432
- 1. TIMESTAMPS: Each clip must contain ONLY the specified speaker's dialogue about a single topic. No overlapping dialogue from other speakers. YOU NEED TO BE VERY CAREFUL ABOUT THIS RULE. YOU HAVE THE TRANSCRIPT AND YOU CAN SEE WHO IS SPEAKING AT WHAT TIME SO BE VERY VERY CARAEFUL AND ONLY INCLUDE THE DIALOGUE OF THE SPEAKER YOU ARE MAKING THE CLIP FOR.
433
- 2. DURATION: Clips should be between 20-60 seconds long.
434
- 3. CONTENT: Select engaging, viral-worthy topics. Avoid mundane or irrelevant content
435
- 4. COVERAGE: Minimum 2 topics per speaker, aim for 3 if good content exists.
436
- 5. YOU CAN IGNORE THE HOST IF NO COMPELLING CONTENT IS FOUND.
437
-
438
- YOU SHOULD Prioritize accuracy in timestamp at every cost.""",
439
- },
440
- {"role": "user", "content": prompt},
441
- ],
442
- stream=True,
443
- temperature=0.5,
444
- )
445
 
446
  collected_messages = []
447
  # Iterate through the stream
@@ -815,7 +743,6 @@ def create_chat_interface():
815
  ct = request.query_params.get("ct", None)
816
  turl = request.query_params.get("turl", None)
817
  uid = request.query_params.get("uid", None)
818
- print(uid)
819
 
820
  required_params = ["cid", "rsid", "origin", "ct", "turl", "uid"]
821
  missing_params = [
@@ -834,7 +761,9 @@ def create_chat_interface():
834
  try:
835
  transcript_data = get_transcript_for_url(turl)
836
  transcript_processor = TranscriptProcessor(
837
- transcript_data=transcript_data
 
 
838
  )
839
 
840
  # Initialize with empty message
 
19
 
20
 
21
  class TranscriptProcessor:
22
+ def __init__(
23
+ self,
24
+ transcript_file: str = None,
25
+ transcript_data: dict = None,
26
+ max_segment_duration: int = None,
27
+ call_type: str = "le",
28
+ ):
29
  self.transcript_file = transcript_file
30
  self.transcript_data = transcript_data
31
  self.formatted_transcript = None
32
  self.segments = []
 
 
33
  self.speaker_mapping = {}
34
+ self.max_segment_duration = max_segment_duration
35
  if self.transcript_file:
36
  self._load_transcript()
37
  elif self.transcript_data:
 
42
  )
43
 
44
  self._process_transcript()
45
+ self._create_formatted_transcript() # Create initial formatted transcript
46
+ if call_type != "si":
47
+ self.map_speaker_ids_to_names()
48
 
49
  def _load_transcript(self) -> None:
50
  """Load the transcript JSON file."""
 
58
  return f"{minutes:02d}:{seconds:02d}"
59
 
60
  def _process_transcript(self) -> None:
 
61
  results = self.transcript_data["results"]
 
 
62
  for segment in results["speaker_labels"]["segments"]:
63
  speaker_id = segment.get("speaker_label", segment.get("speakerlabel", ""))
64
  speaker_id = (
 
68
  start_time = float(segment.get("start_time", 0))
69
  end_time = float(segment.get("end_time", 0))
70
 
71
+ if self.max_segment_duration:
72
+ current_start = start_time
73
+ while current_start < end_time:
74
+ current_end = min(
75
+ current_start + self.max_segment_duration, end_time
 
 
 
 
 
 
 
 
 
 
 
 
76
  )
77
+ self._create_segment(
78
+ speaker_id, current_start, current_end, results["items"]
79
+ )
80
+ current_start = current_end
81
+ else:
82
+ self._create_segment(speaker_id, start_time, end_time, results["items"])
83
+
84
+ def _create_segment(
85
+ self, speaker_id: str, start: float, end: float, items: list
86
+ ) -> None:
87
+ matching_items = [
88
+ item
89
+ for item in items
90
+ if "start_time" in item
91
+ and float(item["start_time"]) >= start
92
+ and float(item["start_time"]) < end
93
+ and item["type"] == "pronunciation"
94
+ ]
95
+
96
+ words = [item["alternatives"][0]["content"] for item in matching_items]
97
+ if words:
98
+ self.segments.append(
99
+ TranscriptSegment(
100
+ speaker_id=speaker_id,
101
+ start_time=start,
102
+ end_time=end,
103
+ text=" ".join(words),
104
  )
105
+ )
106
 
107
+ def _create_formatted_transcript(self) -> None:
108
+ """Create formatted transcript with default speaker labels."""
109
  formatted_segments = []
110
  for seg in self.segments:
111
  start_time_str = self._format_time(seg.start_time)
112
  end_time_str = self._format_time(seg.end_time)
113
+ # Use default speaker label (spk_X) if no mapping exists
114
+ speaker_label = f"spk_{seg.speaker_id}"
115
  formatted_segments.append(
116
  f"time_stamp: {start_time_str}-{end_time_str}\n"
117
+ f"{speaker_label}: {seg.text}\n"
118
  )
 
119
  self.formatted_transcript = "\n".join(formatted_segments)
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  def map_speaker_ids_to_names(self) -> None:
122
  """Map speaker IDs to names based on introductions in the transcript."""
123
  try:
 
124
  transcript = self.formatted_transcript
125
 
126
  prompt = (
 
143
  try:
144
  self.speaker_mapping = json.loads(response_text)
145
  except json.JSONDecodeError:
 
146
  response_text = response_text[
147
  response_text.find("{") : response_text.rfind("}") + 1
148
  ]
 
151
  except json.JSONDecodeError:
152
  print("Error parsing speaker mapping JSON.")
153
  self.speaker_mapping = {}
154
+
155
+ # Update segments with speaker names and recreate formatted transcript
156
  for segment in self.segments:
157
  spk_id = f"spk_{segment.speaker_id}"
158
  speaker_name = self.speaker_mapping.get(spk_id, spk_id)
159
  segment.speaker_name = speaker_name
160
 
161
+ self._create_formatted_transcript_with_names()
 
 
 
 
 
 
 
 
 
162
 
163
  except Exception as e:
164
  print(f"Error mapping speaker IDs to names: {str(e)}")
165
  self.speaker_mapping = {}
166
 
167
+ def _create_formatted_transcript_with_names(self) -> None:
168
+ """Create formatted transcript with mapped speaker names."""
169
+ formatted_segments = []
170
+ for seg in self.segments:
171
+ start_time_str = self._format_time(seg.start_time)
172
+ end_time_str = self._format_time(seg.end_time)
173
+ speaker_name = getattr(seg, "speaker_name", f"spk_{seg.speaker_id}")
174
+ formatted_segments.append(
175
+ f"time_stamp: {start_time_str}-{end_time_str}\n"
176
+ f"{speaker_name}: {seg.text}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  )
178
+ self.formatted_transcript = "\n".join(formatted_segments)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  def get_transcript(self) -> str:
181
  """Return the formatted transcript with speaker names."""
 
247
  link_start = "http"
248
  else:
249
  link_start = "https"
 
250
  if ct == "si": # street interview
251
  prompt = f"""This is a transcript for a street interview. Call Details are as follows:
252
  User ID UID: {uid}
 
294
  - [Take 1. <div id='topic' style="display: inline"> 10s at 09:45]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{585}}&et={{595}}&uid={{uid}})
295
  - [Take 1. <div id='topic' style="display: inline"> 20s at 25:45]({link_start}://{{origin}}/collab/{{cid}}/{{rsid}}?st={{1245}}&et={{1265}}&uid={{uid}}))
296
  - [Take 3 (Best). <div id='topic' style="display: inline"> 5s at 10:13 </div>]({link_start}://roll.ai/colab/1234aq_12314/51234151?st=613&et=618&uid=82314)"""
297
+ completion = client.chat.completions.create(
298
+ model="gpt-4o-mini",
299
+ messages=[
300
+ {
301
+ "role": "system",
302
+ "content": f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, Call Type: {ct}.
303
+ CORE REQUIREMENT:
304
+ - TIMESTAMPS: A speaker can repeat the answer to a question multiple times. You need to pick the last answer very carefully and choose that as best take. Make sure that that same answer is not repeated again after the best answer.
305
+
306
+ YOU SHOULD Prioritize accuracy in timestamp at every cost. Read the Transcript carefully and decide where an answer starts and ends. You will have speaker labels so you need to be very sharp.""",
307
+ },
308
+ {"role": "user", "content": prompt},
309
+ ],
310
+ stream=True,
311
+ temperature=0.5,
312
+ )
313
  else:
314
  prompt = f"""Call Details:
315
  User ID: {uid}
 
349
  - Topics should be self-contained discussions within the timestamp
350
  - Skip speakers if fewer than 2 compelling topics found
351
  """
352
+ completion = client.chat.completions.create(
353
+ model="gpt-4o-mini",
354
+ messages=[
355
+ {
356
+ "role": "system",
357
+ "content": f"""You are analyzing a transcript for Call ID: {cid}, Session ID: {rsid}, Origin: {origin}, Call Type: {ct}.
358
+
359
+ CORE REQUIREMENTS:
360
+ 1. TIMESTAMPS: Each clip must contain ONLY the specified speaker's dialogue about a single topic. No overlapping dialogue from other speakers. YOU NEED TO BE VERY CAREFUL ABOUT THIS RULE. YOU HAVE THE TRANSCRIPT AND YOU CAN SEE WHO IS SPEAKING AT WHAT TIME SO BE VERY VERY CARAEFUL AND ONLY INCLUDE THE DIALOGUE OF THE SPEAKER YOU ARE MAKING THE CLIP FOR.
361
+ 2. DURATION: Clips should be between 20-60 seconds long.
362
+ 3. CONTENT: Select engaging, viral-worthy topics. Avoid mundane or irrelevant content
363
+ 4. COVERAGE: Minimum 2 topics per speaker, aim for 3 if good content exists.
364
+ 5. YOU CAN IGNORE THE HOST IF NO COMPELLING CONTENT IS FOUND.
365
+
366
+ YOU SHOULD Prioritize accuracy in timestamp at every cost.""",
367
+ },
368
+ {"role": "user", "content": prompt},
369
+ ],
370
+ stream=True,
371
+ temperature=0.5,
372
+ )
 
373
 
374
  collected_messages = []
375
  # Iterate through the stream
 
743
  ct = request.query_params.get("ct", None)
744
  turl = request.query_params.get("turl", None)
745
  uid = request.query_params.get("uid", None)
 
746
 
747
  required_params = ["cid", "rsid", "origin", "ct", "turl", "uid"]
748
  missing_params = [
 
761
  try:
762
  transcript_data = get_transcript_for_url(turl)
763
  transcript_processor = TranscriptProcessor(
764
+ transcript_data=transcript_data,
765
+ max_segment_duration=10,
766
+ call_type=ct,
767
  )
768
 
769
  # Initialize with empty message