# ChatWithTranscript / broll_generator.py
# Author: AhmadMustafa — commit "b roll" (f098be9)
# (Hugging Face Space page chrome converted to comments so the file parses.)
import json
import os
import re
from typing import Dict, List, Tuple, Union
import requests
from openai import OpenAI
def extract_clips_from_analysis(analysis_text: str) -> List[Dict]:
    """
    Extract social media clips from the initial analysis output.

    Args:
        analysis_text: The formatted analysis text from get_initial_analysis

    Returns:
        List of clip dictionaries with clip_title, start_time, end_time and
        duration (all times in seconds); empty list when nothing matches.
    """
    print(f"Starting extract_clips_from_analysis with analysis_text length: {len(analysis_text)}")
    clips = []
    # Pattern to match clip links with timestamps, e.g.:
    #   [Introduction and Event Overview <div id='topic' style="display: inline"> 40s at 03:25 </div>]
    # Generalized (backward-compatibly) to also accept an optional hours
    # field ("H:MM:SS") and one- or two-digit minute values, so clips past
    # the one-hour mark are no longer silently dropped.
    pattern = r"\[([^<]+)<div[^>]*>\s*(\d+)s\s+at\s+(?:(\d{1,2}):)?(\d{1,2}):(\d{2})\s*</div>\]"
    matches = re.findall(pattern, analysis_text)
    print(f"Found {len(matches)} matches in analysis text")
    for raw_title, raw_duration, raw_hours, raw_minutes, raw_seconds in matches:
        title = raw_title.strip()
        duration = int(raw_duration)
        hours = int(raw_hours) if raw_hours else 0  # hours group is optional
        start_time = hours * 3600 + int(raw_minutes) * 60 + int(raw_seconds)
        end_time = start_time + duration
        clip = {
            "clip_title": title,
            "start_time": start_time,
            "end_time": end_time,
            "duration": duration,
        }
        clips.append(clip)
        print(f"Extracted clip: {title} ({start_time}-{end_time}s)")
    print(f"Total clips extracted: {len(clips)}")
    return clips
def extract_transcript_content(
    transcript_data: List, start_time: float, end_time: float
) -> str:
    """
    Extract transcript content between start and end times.

    Args:
        transcript_data: List of transcript segments (TranscriptSegment
            objects or dicts)
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Extracted transcript text (segments joined with spaces)
    """
    print(f"Extracting transcript content for {start_time}-{end_time}s from {len(transcript_data)} segments")

    def _normalize(seg):
        # Reduce any supported segment shape to a (start, end, text) triple.
        if hasattr(seg, "start_time") and hasattr(seg, "end_time"):
            # TranscriptSegment-like object
            return seg.start_time, seg.end_time, seg.text
        if hasattr(seg, "get"):
            # Dictionary format (accepts both start_time/end_time and start/end keys)
            return (
                seg.get("start_time", seg.get("start", 0)),
                seg.get("end_time", seg.get("end", 0)),
                seg.get("text", ""),
            )
        # Fallback: arbitrary object with direct attribute access
        return (
            getattr(seg, "start_time", getattr(seg, "start", 0)),
            getattr(seg, "end_time", getattr(seg, "end", 0)),
            getattr(seg, "text", ""),
        )

    # Keep every segment that overlaps the [start_time, end_time] window.
    content = [
        text
        for seg_start, seg_end, text in map(_normalize, transcript_data)
        if seg_start <= end_time and seg_end >= start_time
    ]
    result = " ".join(content).strip()
    print(f"Extracted {len(content)} segments, total text length: {len(result)}")
    return result
def generate_broll_queries(
    client: OpenAI, transcript_content: str, clip_data: Dict
) -> List[Dict]:
    """
    Generate B-roll search queries using OpenAI based on transcript content
    and clip data.

    Args:
        client: OpenAI client
        transcript_content: Transcript text for the clip timeframe
        clip_data: Social media clip data with timestamps

    Returns:
        List of query dictionaries with timestamps; empty list on any failure.
    """
    clip_title = clip_data.get("clip_title", "Unknown")
    clip_start = clip_data.get("start_time", 0)
    clip_end = clip_data.get("end_time", 0)
    duration = clip_end - clip_start
    print(f"Generating B-roll queries for clip: {clip_title}")
    prompt = f"""
Analyze this transcript content from a social media clip and generate appropriate B-roll search queries.
Clip Title: {clip_title}
Start Time: {clip_start} seconds
End Time: {clip_end} seconds
Duration: {duration} seconds
Transcript Content:
{transcript_content}
Generate 3-5 specific search queries that would find relevant B-roll images for this content.
For each query, specify the exact timestamp within the clip where it would be most relevant.
Focus on:
- Key people, places, or concepts mentioned
- Visual metaphors or illustrations
- Current events or topics discussed
- Products, companies, or brands mentioned
Return a JSON array with this structure:
[
{{
"query": "specific search query for Google Images",
"timestamp_in_clip": 5.2,
"relevance_reason": "why this image is relevant at this moment"
}}
]
Ensure timestamps are between 0 and {duration} seconds.
Make queries specific and descriptive for better image search results.
"""
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert video editor specializing in finding relevant B-roll content for social media clips. Generate specific, searchable queries that will find compelling visual content.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
        )
        raw = completion.choices[0].message.content
        # Strip a ```json ... ``` fence if the model wrapped its answer in one;
        # otherwise parse the whole reply as JSON.
        is_fenced = "```json" in raw and "```" in raw.split("```json", 1)[1]
        payload = raw.split("```json", 1)[1].split("```", 1)[0] if is_fenced else raw
        queries = json.loads(payload)
        print(f"Generated {len(queries)} B-roll queries")
        return queries
    except Exception as e:
        print(f"Error generating B-roll queries: {str(e)}")
        return []
def search_google_images(
    query: str, api_key: str, search_engine_id: str, num_results: int = 3
) -> List[Dict]:
    """
    Search Google Images using the Custom Search JSON API.

    Args:
        query: Search query string
        api_key: Google API key
        search_engine_id: Google Custom Search Engine ID
        num_results: Number of results to return (the API caps this at 10)

    Returns:
        List of image result dictionaries; empty list on any error (errors
        are logged, never raised, so one bad query can't kill a batch).
    """
    try:
        url = "https://www.googleapis.com/customsearch/v1"
        params = {
            "key": api_key,
            "cx": search_engine_id,
            "q": query,
            "searchType": "image",
            "num": num_results,
            "safe": "active",
            "imgSize": "large",
            "imgType": "photo",
        }
        # Bug fix: requests has no default timeout, so a stalled connection
        # would hang the whole B-roll pipeline indefinitely. Timeouts now
        # surface as an exception and fall into the except branch below.
        response = requests.get(url, params=params, timeout=15)
        response.raise_for_status()
        data = response.json()
        results = []
        for item in data.get("items", []):
            image_meta = item.get("image", {})  # hoist the repeated nested lookup
            results.append(
                {
                    "title": item.get("title", ""),
                    "image_url": item.get("link", ""),
                    "thumbnail_url": image_meta.get("thumbnailLink", ""),
                    "context_url": image_meta.get("contextLink", ""),
                    "width": image_meta.get("width", 0),
                    "height": image_meta.get("height", 0),
                    "file_size": image_meta.get("byteSize", 0),
                }
            )
        return results
    except Exception as e:
        print(f"Error searching Google Images for query '{query}': {str(e)}")
        return []
def process_broll_generation(
    transcript_data: List,
    analysis_text: str,
    google_api_key: Union[str, None] = None,
    search_engine_id: Union[str, None] = None,
) -> List[Dict]:
    """
    Main processing function to generate B-roll content for social media clips.

    Args:
        transcript_data: Full transcript data from TranscriptProcessor (list of
            TranscriptSegment objects or dicts)
        analysis_text: The formatted analysis output from get_initial_analysis
        google_api_key: Google API key for image search; when this or
            search_engine_id is missing, image search is skipped
        search_engine_id: Google Custom Search Engine ID

    Returns:
        List of processed clips with B-roll suggestions

    Raises:
        Exception: any unexpected error is logged and re-raised
    """
    try:
        print("Starting B-roll generation process")
        print(f"Transcript data type: {type(transcript_data)}, length: {len(transcript_data) if transcript_data else 0}")
        print(f"Analysis text length: {len(analysis_text) if analysis_text else 0}")
        # Initialize OpenAI client (credentials come from the environment)
        client = OpenAI()
        # Extract clips from analysis text
        social_clips = extract_clips_from_analysis(analysis_text)
        if not social_clips:
            print("No clips found in analysis text")
            return []
        processed_clips = []
        for i, clip in enumerate(social_clips, 1):
            print(f"Processing clip {i}/{len(social_clips)}: {clip.get('clip_title', 'Unknown')}")
            start_time = clip.get("start_time", 0)
            end_time = clip.get("end_time", 0)
            # Extract relevant transcript content
            transcript_content = extract_transcript_content(
                transcript_data, start_time, end_time
            )
            if not transcript_content:
                # Keep the clip in the output so callers see every clip, but
                # flag the missing transcript explicitly instead of dropping it.
                print(f"No transcript content found for clip {start_time}-{end_time}")
                processed_clips.append(
                    {
                        **clip,
                        "broll_suggestions": [],
                        "error": "No transcript content found",
                    }
                )
                continue
            # Generate B-roll queries
            broll_queries = generate_broll_queries(client, transcript_content, clip)
            broll_suggestions = []
            for j, query_data in enumerate(broll_queries, 1):
                print(f"Processing query {j}/{len(broll_queries)}: {query_data.get('query', 'Unknown')}")
                query = query_data.get("query", "")
                timestamp = query_data.get("timestamp_in_clip", 0)
                reason = query_data.get("relevance_reason", "")
                if not query:
                    continue  # skip malformed entries with no usable query
                # Search Google Images only when both credentials are present
                images = []
                if google_api_key and search_engine_id:
                    print(f"Searching Google Images for: {query}")
                    images = search_google_images(
                        query, google_api_key, search_engine_id
                    )
                    print(f"Found {len(images)} images")
                else:
                    print("Skipping Google Images search (no API credentials)")
                broll_suggestions.append(
                    {
                        "query": query,
                        "timestamp_in_clip": timestamp,
                        # Position relative to the full video, not just the clip
                        "absolute_timestamp": start_time + timestamp,
                        "relevance_reason": reason,
                        "images": images,
                    }
                )
            processed_clips.append(
                {
                    **clip,
                    "transcript_content": transcript_content,
                    "broll_suggestions": broll_suggestions,
                }
            )
            print(f"Completed processing clip {i}, found {len(broll_suggestions)} suggestions")
        print(f"B-roll generation complete. Processed {len(processed_clips)} clips")
        return processed_clips
    except Exception as e:
        print(f"Error in process_broll_generation: {str(e)}")
        # Bare raise preserves the original traceback; `raise e` would add
        # this frame as the apparent raise site.
        raise
def format_broll_output(processed_clips: List[Dict]) -> str:
    """
    Format B-roll suggestions for display in the chat interface.

    Args:
        processed_clips: List of processed clips with B-roll suggestions

    Returns:
        Formatted string for display
    """
    if not processed_clips:
        return "No B-roll suggestions generated."

    def _mmss(total_seconds) -> str:
        # Render a second count as zero-padded MM:SS.
        minutes, seconds = divmod(int(total_seconds), 60)
        return f"{minutes:02d}:{seconds:02d}"

    lines = ["🎬 B-Roll Suggestions\n"]
    for clip_no, clip in enumerate(processed_clips, 1):
        lines.append(f"\n{clip_no}. {clip.get('clip_title', 'Unknown Clip')}")
        lines.append(
            f"Time: {_mmss(clip.get('start_time', 0))} - {_mmss(clip.get('end_time', 0))}"
        )
        suggestions = clip.get("broll_suggestions", [])
        if not suggestions:
            lines.append("No B-roll suggestions available for this clip.")
        else:
            for s_no, suggestion in enumerate(suggestions, 1):
                lines.append(f"  Query {s_no}: {suggestion.get('query', '')}")
                lines.append(f"    At: {_mmss(suggestion.get('timestamp_in_clip', 0))}")
                images = suggestion.get("images", [])
                if images:
                    # Show at most the top 2 image links per query.
                    for link_no, img in enumerate(images[:2], 1):
                        img_url = img.get("image_url", "")
                        if img_url:
                            lines.append(
                                f"      Link {link_no}: {img.get('title', 'Image')[:50]} - {img_url}"
                            )
                else:
                    lines.append("      No images found for this query.")
        lines.append("")
    return "\n".join(lines)