File size: 17,983 Bytes
09425e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 |
import json
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import gradio as gr
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
# Prompt selector read by get_initial_analysis(): True picks the street-interview
# prompt (best take per question); False picks the topic-shortlisting prompt.
street_interview = True
@dataclass
class TranscriptSegment:
    """One contiguous stretch of speech attributed to a single speaker."""

    # Speaker label with any "spk_" / "spk" prefix already removed.
    speaker_id: str
    # Segment boundaries as floats; rendered as MM:SS, so treated as seconds.
    start_time: float
    end_time: float
    # Words spoken in the segment, joined with single spaces.
    text: str
    # Display name; stays empty until a speaker mapping has been applied.
    speaker_name: str = ""
class TranscriptProcessor:
    """Turn a speaker-labelled transcript JSON file into readable, named text.

    Construction loads the file, splits it into per-speaker segments, renders a
    timestamped transcript and then asks an OpenAI chat model to map ``spk_N``
    labels to the names speakers introduce themselves with.

    The JSON must provide ``results.speaker_labels.segments`` and
    ``results.items`` (the layout resembles Amazon Transcribe output; confirm
    against the producer of the file).
    """

    # Seconds to wait for the agenda page. Without a timeout ``requests.get``
    # may block indefinitely on an unresponsive host.
    AGENDA_REQUEST_TIMEOUT = 10

    def __init__(self, transcript_file: str):
        """Load, segment and name-map the transcript stored at *transcript_file*.

        Note that this calls the OpenAI API (via ``map_speaker_ids_to_names``).
        """
        self.transcript_file = transcript_file
        self.transcript_data = None  # raw parsed JSON
        self.formatted_transcript = None  # rendered text handed to the LLM
        self.segments = []  # list of TranscriptSegment
        self.text_windows = []  # combined text of consecutive segments
        self.window_size = 2  # number of segments merged per text window
        self.speaker_mapping = {}  # "spk_N" -> display name
        self._load_transcript()
        self._process_transcript()
        self.map_speaker_ids_to_names()  # Map speaker IDs to names

    def _load_transcript(self) -> None:
        """Load the transcript JSON file."""
        # JSON interchange text is UTF-8; do not depend on the platform default.
        with open(self.transcript_file, "r", encoding="utf-8") as f:
            self.transcript_data = json.load(f)

    def _format_time(self, seconds: float) -> str:
        """Convert seconds to formatted time string (MM:SS)."""
        minutes = int(seconds // 60)
        seconds = int(seconds % 60)
        return f"{minutes:02d}:{seconds:02d}"

    def _render_transcript(self, use_names: bool) -> str:
        """Render ``self.segments`` as timestamped text blocks.

        With ``use_names`` false each block is labelled ``spk <id>``; otherwise
        the segment's ``speaker_name`` is used.
        """
        blocks = []
        for seg in self.segments:
            label = seg.speaker_name if use_names else f"spk {seg.speaker_id}"
            blocks.append(
                f"time_stamp: {self._format_time(seg.start_time)}-"
                f"{self._format_time(seg.end_time)}\n"
                f"{label}: {seg.text}\n"
            )
        return "\n".join(blocks)

    def _apply_speaker_names(self) -> None:
        """Copy names from ``speaker_mapping`` onto segments and re-render.

        A speaker without an entry keeps its ``spk_N`` key as display name.
        """
        for segment in self.segments:
            spk_id = f"spk_{segment.speaker_id}"
            segment.speaker_name = self.speaker_mapping.get(spk_id, spk_id)
        self.formatted_transcript = self._render_transcript(use_names=True)

    @staticmethod
    def _parse_speaker_mapping(response_text: str) -> Optional[Dict]:
        """Extract a JSON object from a model reply.

        Tries the whole reply first, then the span from the first ``{`` to the
        last ``}`` (covers replies wrapped in prose or code fences). Returns
        ``None`` when neither candidate decodes to a dict.
        """
        braced = response_text[response_text.find("{") : response_text.rfind("}") + 1]
        for candidate in (response_text, braced):
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            # A list or scalar is valid JSON but unusable as a mapping.
            if isinstance(parsed, dict):
                return parsed
        return None

    def _process_transcript(self) -> None:
        """Process the transcript into segments with speaker information and create a formatted version with timestamps."""
        results = self.transcript_data["results"]

        # Only timed "pronunciation" items contribute words. Filter and convert
        # them once instead of rescanning every item for every segment.
        timed_items = [
            (float(item["start_time"]), item)
            for item in results.get("items", [])
            if "start_time" in item and item.get("type") == "pronunciation"
        ]

        # Process into segments
        for segment in results["speaker_labels"]["segments"]:
            speaker_id = segment.get("speaker_label", segment.get("speakerlabel", ""))
            speaker_id = (
                speaker_id.replace("spk_", "").replace("spk", "") if speaker_id else ""
            )
            start_time = float(segment.get("start_time", 0))
            end_time = float(segment.get("end_time", 0))
            # An item belongs to the segment when it starts in [start, end).
            words = [
                item["alternatives"][0]["content"]
                for item_start, item in timed_items
                if start_time <= item_start < end_time
            ]
            if words:
                self.segments.append(
                    TranscriptSegment(
                        speaker_id=speaker_id,
                        start_time=start_time,
                        end_time=end_time,
                        text=" ".join(words),
                    )
                )

        self.formatted_transcript = self._render_transcript(use_names=False)

        # Create sliding windows of text for better matching
        for i in range(len(self.segments)):
            # Combine current segment with next segments within window
            window_segments = self.segments[i : i + self.window_size]
            if window_segments:  # empty only when window_size <= 0
                self.text_windows.append(
                    {
                        "text": " ".join(seg.text for seg in window_segments),
                        "start_time": window_segments[0].start_time,
                        "end_time": window_segments[-1].end_time,
                    }
                )

    def map_speaker_ids_to_names(self) -> None:
        """Map speaker IDs to names based on introductions in the transcript.

        Best effort: any failure (API error, unusable reply) is printed and
        leaves ``speaker_mapping`` empty rather than raising.
        """
        try:
            transcript = self.formatted_transcript
            prompt = (
                "Given the following transcript where speakers are identified as spk 0, spk 1, spk 2, etc., please map each spk ID to the speaker's name based on their introduction in the transcript. If no name is introduced for a speaker, keep it as spk_id. Return the mapping as a JSON object in the format {'spk_0': 'Speaker Name', 'spk_1': 'Speaker Name', ...}\n\n"
                f"Transcript:\n{transcript}"
            )
            client = OpenAI()
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0,
            )
            response_text = completion.choices[0].message.content.strip()
            mapping = self._parse_speaker_mapping(response_text)
            if mapping is None:
                print("Error parsing speaker mapping JSON.")
                mapping = {}
            self.speaker_mapping = mapping
            # Store names on the segments and rebuild the transcript with them.
            self._apply_speaker_names()
        except Exception as e:
            print(f"Error mapping speaker IDs to names: {str(e)}")
            self.speaker_mapping = {}

    @staticmethod
    def _extract_agenda(soup) -> str:
        """Return the agenda text found in a parsed page, or ``""``.

        Prefers the ``description`` of the first ``application/ld+json`` script
        and falls back to ``<meta name="description">``. Unparsable or
        non-object ld+json falls through to the meta tag instead of raising.
        """
        agenda = ""
        description_tag = soup.find("script", {"type": "application/ld+json"})
        if description_tag:
            try:
                # ``.string`` may be None when the tag has no single text child.
                json_data = json.loads(description_tag.string or "")
            except json.JSONDecodeError:
                json_data = None
            if isinstance(json_data, dict) and "description" in json_data:
                agenda = json_data["description"]
            else:
                print("Agenda description not found in the JSON metadata.")
        else:
            print("No structured data (ld+json) found.")

        if not agenda:
            print("No agenda found in the structured metadata. Trying meta tags.")
            # Fallback: Use meta description if ld+json doesn't have it
            meta_description = soup.find("meta", {"name": "description"})
            agenda = meta_description.get("content", "") if meta_description else ""
        return agenda

    def correct_speaker_mapping_with_agenda(self, url: str) -> None:
        """Fetch agenda from a URL and correct the speaker mapping using OpenAI.

        Best effort: network, parsing and API failures are printed and leave
        the current mapping and transcript unchanged.
        """
        try:
            # Fetch the HTML content from the URL
            response = requests.get(url, timeout=self.AGENDA_REQUEST_TIMEOUT)
            response.raise_for_status()

            agenda = self._extract_agenda(BeautifulSoup(response.text, "html.parser"))
            if not agenda:
                print("No agenda found in any description tags.")
                return

            prompt = (
                f"Given the speaker mapping {self.speaker_mapping}, agenda:\n{agenda}, and the transcript: {self.formatted_transcript}\n\n"
                "Some speaker names in the mapping might have spelling errors or be incomplete."
                "Please correct the names based on the agenda. Return the corrected mapping in JSON format as "
                "{'spk_0': 'Correct Name', 'spk_1': 'Correct Name', ...}."
                "You should only update the name if the name sounds very similar, or there is a good spelling overlap/ The Speaker Introduction matches the description of the Talk from Agends. If the name is totally unrelated, keep the original name."
            )
            # Use OpenAI API to get corrected mapping
            client = OpenAI()
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0,
            )
            response_text = completion.choices[0].message.content.strip()
            corrected_mapping = self._parse_speaker_mapping(response_text)
            if corrected_mapping is None:
                print(
                    "Error parsing corrected speaker mapping JSON, keeping the original mapping."
                )
                corrected_mapping = self.speaker_mapping

            # Update the speaker mapping with corrected names
            self.speaker_mapping = corrected_mapping
            print("Corrected Speaker Mapping:", self.speaker_mapping)
            # Update the segments and recreate the transcript with corrected names
            self._apply_speaker_names()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching agenda from URL: {str(e)}")
        except Exception as e:
            print(f"Error correcting speaker mapping: {str(e)}")

    def get_transcript(self) -> str:
        """Return the formatted transcript with speaker names."""
        return self.formatted_transcript

    def get_transcript_data(self) -> Dict:
        """Return the raw transcript data."""
        return self.transcript_data
def setup_openai_key() -> None:
    """Export the contents of ``api.key`` as the ``OPENAI_API_KEY`` variable.

    The file is looked up relative to the current working directory and
    surrounding whitespace is stripped from its contents.

    Raises:
        FileNotFoundError: if ``api.key`` does not exist.
    """
    try:
        key_file = open("api.key", "r")
    except FileNotFoundError:
        raise FileNotFoundError(
            "api.key file not found. Please create it with your OpenAI API key."
        )
    with key_file:
        os.environ["OPENAI_API_KEY"] = key_file.read().strip()
def get_initial_analysis(transcript_processor: TranscriptProcessor) -> str:
    """Ask the chat model for a first-pass analysis of the transcript.

    The module-level ``street_interview`` flag selects the prompt: best-take
    timestamps per question when true, otherwise a ranked list of topics per
    speaker. The prompt is echoed to stdout before the request is sent.

    Returns:
        The model's reply, or a fixed error message if anything raises.
    """
    try:
        transcript = transcript_processor.get_transcript()
        client = OpenAI()
        # The prompt bodies are runtime text sent to the model; their
        # continuation lines sit at column 0 so no extra whitespace is sent.
        if not street_interview:
            prompt = f"""Given the transcript {transcript}, For All the speakers, short list all people, news, events, trends, and source that are discussed by speakers along with the start time of that topic and end time of that topic from the transcript. Rank all topics based on what would make for the best social clips. I need atleast 3 topics per speaker.
You should mention the Speaker Name first, then 3 posts with their timestamps, and so on.
Return format is: Speaker Name\n1.Topic: topic, Start Time: start_time, End Time: end_time\n2...."""
        else:
            prompt = f"""This is a transcript for a street interview. Transcript: {transcript}
In this street interview, the host asks multiple questions to the interviewees.
The interviewee can repeat a single answer multiple time to get the best take.
Your job is to find out the timestamp of the best answer given by the interviewee (Do not include the Question timestamp by interviwer in this). If there are multiple attempts for a question, best part is the last part of the question. If no question was asked but something is repeated, please include that in the answer as well
The way to know if there are multiple takes to a question is to see in the transcript if the same text is repeated, If not then number of takes is 1.
Question 1 should always be the introduction if the speaker has introduced themselves to find the best introduction time (Last timestamp is the best timestamp), Rest of questions should be in the order they were asked.
Return format is:
1. Question: question
Number of takes: number
Best Answer timestamp: start_time - end_time
You can visit the call segment on this URL: https://roll.ai/call_id/colab_id?starttime=start_time?endtime=end_time."
"""
        print(prompt)
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        reply = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=conversation,
        )
        return reply.choices[0].message.content
    except Exception as exc:
        print(f"Error in initial analysis: {exc}")
        return "An error occurred during initial analysis. Please check your API key and file path."
# Hard-coded identifiers of the call; interpolated into generated links and
# into the system prompt built by chat().
call_id = "20240226t210135"
colab_id = "1231412431212"


def generate_call_link(start_time: str) -> str:
    """Build the roll.ai URL for the call at *start_time*.

    Colons in the timestamp are turned into dots, so ``"01:30"`` becomes
    ``t=01.30`` in the query string.
    """
    dotted_time = ".".join(start_time.split(":"))
    return f"https://roll.ai/{call_id}/{colab_id}?t={dotted_time}"
def chat(
    message: str, chat_history: List, transcript_processor: TranscriptProcessor
) -> str:
    """Answer *message* about the transcript, given the conversation so far.

    Args:
        message: The user's new question.
        chat_history: ``(user_msg, assistant_msg)`` pairs; either element may
            be ``None`` and is then left out of the request.
        transcript_processor: Supplies the transcript embedded in the system
            prompt.

    Returns:
        The model's reply text, or a fixed apology string if anything raises.
    """
    try:
        client = OpenAI()
        # Runtime prompt text: continuation lines sit at column 0 so that no
        # extra whitespace is sent to the model.
        system_prompt = f"""You are a helpful assistant analyzing transcripts and generating timestamps and URL. Call ID is {call_id} and Colab ID is {colab_id}.
Transcript: {transcript_processor.get_transcript()}
If a user asks timestamps for a specific topic, find the start time and end time of that specific topic and return answer in the format: 'Timestamp: start_time - end_time'.
You can visit the call segment on this URL: https://roll.ai/call_id/colab_id?starttime=start_time?endtime=end_time."
If a user requests a link to a specific segment topic, generate a link to that segment using the following format: https://roll.ai/call_id/colab_id?starttime=start_time?endtime=end_time."""

        messages = [{"role": "system", "content": system_prompt}]
        for user_turn, assistant_turn in chat_history:
            # A None slot (e.g. the seeded analysis has no user turn) is skipped.
            for role, content in (("user", user_turn), ("assistant", assistant_turn)):
                if content is not None:
                    messages.append({"role": role, "content": content})
        # The new question always goes last.
        messages.append({"role": "user", "content": message})

        reply = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
        )
        return reply.choices[0].message.content
    except Exception as exc:
        print(f"Unexpected error in chat: {exc}")
        import traceback

        print(f"Traceback: {traceback.format_exc()}")
        return "Sorry, there was an error processing your request."
def create_chat_interface(transcript_processor: TranscriptProcessor):
    """Build the Gradio Blocks UI: a chatbot, a textbox and a clear button.

    The chatbot starts with one assistant-only turn holding the result of
    ``get_initial_analysis``; submitting the textbox routes through ``chat``.

    Returns:
        The configured ``gr.Blocks`` app (not yet launched).
    """

    def respond(message: str, chat_history: List) -> Tuple[str, List]:
        # Empty submissions leave the history untouched.
        if message:
            reply = chat(message, chat_history, transcript_processor)
            # Return a new list rather than mutating the one Gradio passed in.
            return "", [*chat_history, (message, reply)]
        return "", chat_history

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        gr.ClearButton([msg, chatbot])

        # Seed the conversation with the transcript analysis (no user turn).
        initial_analysis = get_initial_analysis(transcript_processor)
        chatbot.value = [(None, initial_analysis)]

        # First output clears the textbox, second replaces the chat history.
        msg.submit(respond, [msg, chatbot], [msg, chatbot])
    return demo
def main():
    """Wire everything together and launch the Gradio app.

    Loads the API key, processes ``step_take19AWS.json`` from the script's own
    directory, refines speaker names against the event page, then serves the
    chat UI with a public share link. Any failure is printed and re-raised.
    """
    try:
        setup_openai_key()

        script_dir = os.path.dirname(os.path.abspath(__file__))
        transcript_path = os.path.join(script_dir, "step_take19AWS.json")
        if not os.path.exists(transcript_path):
            raise FileNotFoundError(
                "Transcript file not found. Please check the file path."
            )

        processor = TranscriptProcessor(transcript_path)
        processor.correct_speaker_mapping_with_agenda("https://lu.ma/STEPSF24")

        create_chat_interface(processor).launch(share=True)
    except Exception as exc:
        print(f"Error starting application: {exc}")
        raise
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|