# ChatWithTranscript / broll_generator.py
# Author: AhmadMustafa — commit "b roll" (f098be9)
# (Hugging Face Space page chrome converted to comments so the file parses.)
import json
import os
import re
from typing import Dict, List, Tuple, Union
import requests
from openai import OpenAI
def extract_clips_from_analysis(analysis_text: str) -> List[Dict]:
    """
    Extract social media clips from the initial analysis output.

    Args:
        analysis_text: The formatted analysis text from get_initial_analysis

    Returns:
        List of clip dictionaries with clip_title, start_time, end_time and
        duration (all times in seconds); empty list when nothing matches.
    """
    print(f"Starting extract_clips_from_analysis with analysis_text length: {len(analysis_text)}")
    clips = []
    # Pattern to match clip links with timestamps, e.g.:
    #   [Introduction and Event Overview <div id='topic' style="display: inline"> 40s at 03:25 </div>]
    # Generalized (backward-compatibly) to also accept an optional hours
    # field ("H:MM:SS") and one- or two-digit minute values, so clips past
    # the one-hour mark are no longer silently dropped.
    pattern = r"\[([^<]+)<div[^>]*>\s*(\d+)s\s+at\s+(?:(\d{1,2}):)?(\d{1,2}):(\d{2})\s*</div>\]"
    matches = re.findall(pattern, analysis_text)
    print(f"Found {len(matches)} matches in analysis text")
    for raw_title, raw_duration, raw_hours, raw_minutes, raw_seconds in matches:
        title = raw_title.strip()
        duration = int(raw_duration)
        hours = int(raw_hours) if raw_hours else 0  # hours group is optional
        start_time = hours * 3600 + int(raw_minutes) * 60 + int(raw_seconds)
        end_time = start_time + duration
        clip = {
            "clip_title": title,
            "start_time": start_time,
            "end_time": end_time,
            "duration": duration,
        }
        clips.append(clip)
        print(f"Extracted clip: {title} ({start_time}-{end_time}s)")
    print(f"Total clips extracted: {len(clips)}")
    return clips
def extract_transcript_content(
    transcript_data: List, start_time: float, end_time: float
) -> str:
    """
    Extract transcript content between start and end times.

    Args:
        transcript_data: List of transcript segments (TranscriptSegment
            objects or dicts)
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Extracted transcript text (segments joined with spaces)
    """
    print(f"Extracting transcript content for {start_time}-{end_time}s from {len(transcript_data)} segments")

    def _normalize(seg):
        # Reduce any supported segment shape to a (start, end, text) triple.
        if hasattr(seg, "start_time") and hasattr(seg, "end_time"):
            # TranscriptSegment-like object
            return seg.start_time, seg.end_time, seg.text
        if hasattr(seg, "get"):
            # Dictionary format (accepts both start_time/end_time and start/end keys)
            return (
                seg.get("start_time", seg.get("start", 0)),
                seg.get("end_time", seg.get("end", 0)),
                seg.get("text", ""),
            )
        # Fallback: arbitrary object with direct attribute access
        return (
            getattr(seg, "start_time", getattr(seg, "start", 0)),
            getattr(seg, "end_time", getattr(seg, "end", 0)),
            getattr(seg, "text", ""),
        )

    # Keep every segment that overlaps the [start_time, end_time] window.
    content = [
        text
        for seg_start, seg_end, text in map(_normalize, transcript_data)
        if seg_start <= end_time and seg_end >= start_time
    ]
    result = " ".join(content).strip()
    print(f"Extracted {len(content)} segments, total text length: {len(result)}")
    return result
def generate_broll_queries(
    client: OpenAI, transcript_content: str, clip_data: Dict
) -> List[Dict]:
    """
    Generate B-roll search queries using OpenAI based on transcript content
    and clip data.

    Args:
        client: OpenAI client
        transcript_content: Transcript text for the clip timeframe
        clip_data: Social media clip data with timestamps

    Returns:
        List of query dictionaries with timestamps; empty list on any failure.
    """
    clip_title = clip_data.get("clip_title", "Unknown")
    clip_start = clip_data.get("start_time", 0)
    clip_end = clip_data.get("end_time", 0)
    duration = clip_end - clip_start
    print(f"Generating B-roll queries for clip: {clip_title}")
    prompt = f"""
Analyze this transcript content from a social media clip and generate appropriate B-roll search queries.
Clip Title: {clip_title}
Start Time: {clip_start} seconds
End Time: {clip_end} seconds
Duration: {duration} seconds
Transcript Content:
{transcript_content}
Generate 3-5 specific search queries that would find relevant B-roll images for this content.
For each query, specify the exact timestamp within the clip where it would be most relevant.
Focus on:
- Key people, places, or concepts mentioned
- Visual metaphors or illustrations
- Current events or topics discussed
- Products, companies, or brands mentioned
Return a JSON array with this structure:
[
{{
"query": "specific search query for Google Images",
"timestamp_in_clip": 5.2,
"relevance_reason": "why this image is relevant at this moment"
}}
]
Ensure timestamps are between 0 and {duration} seconds.
Make queries specific and descriptive for better image search results.
"""
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert video editor specializing in finding relevant B-roll content for social media clips. Generate specific, searchable queries that will find compelling visual content.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
        )
        raw = completion.choices[0].message.content
        # Strip a ```json ... ``` fence if the model wrapped its answer in one;
        # otherwise parse the whole reply as JSON.
        is_fenced = "```json" in raw and "```" in raw.split("```json", 1)[1]
        payload = raw.split("```json", 1)[1].split("```", 1)[0] if is_fenced else raw
        queries = json.loads(payload)
        print(f"Generated {len(queries)} B-roll queries")
        return queries
    except Exception as e:
        print(f"Error generating B-roll queries: {str(e)}")
        return []
def search_google_images(
    query: str, api_key: str, search_engine_id: str, num_results: int = 3
) -> List[Dict]:
    """
    Search Google Images using the Custom Search JSON API.

    Args:
        query: Search query string
        api_key: Google API key
        search_engine_id: Google Custom Search Engine ID
        num_results: Number of results to return (the API caps this at 10)

    Returns:
        List of image result dictionaries; empty list on any error (errors
        are logged, never raised, so one bad query can't kill a batch).
    """
    try:
        url = "https://www.googleapis.com/customsearch/v1"
        params = {
            "key": api_key,
            "cx": search_engine_id,
            "q": query,
            "searchType": "image",
            "num": num_results,
            "safe": "active",
            "imgSize": "large",
            "imgType": "photo",
        }
        # Bug fix: requests has no default timeout, so a stalled connection
        # would hang the whole B-roll pipeline indefinitely. Timeouts now
        # surface as an exception and fall into the except branch below.
        response = requests.get(url, params=params, timeout=15)
        response.raise_for_status()
        data = response.json()
        results = []
        for item in data.get("items", []):
            image_meta = item.get("image", {})  # hoist the repeated nested lookup
            results.append(
                {
                    "title": item.get("title", ""),
                    "image_url": item.get("link", ""),
                    "thumbnail_url": image_meta.get("thumbnailLink", ""),
                    "context_url": image_meta.get("contextLink", ""),
                    "width": image_meta.get("width", 0),
                    "height": image_meta.get("height", 0),
                    "file_size": image_meta.get("byteSize", 0),
                }
            )
        return results
    except Exception as e:
        print(f"Error searching Google Images for query '{query}': {str(e)}")
        return []
def process_broll_generation(
    transcript_data: List,
    analysis_text: str,
    google_api_key: Union[str, None] = None,
    search_engine_id: Union[str, None] = None,
) -> List[Dict]:
    """
    Main processing function to generate B-roll content for social media clips.

    Args:
        transcript_data: Full transcript data from TranscriptProcessor (list of
            TranscriptSegment objects or dicts)
        analysis_text: The formatted analysis output from get_initial_analysis
        google_api_key: Google API key for image search; when this or
            search_engine_id is missing, image search is skipped
        search_engine_id: Google Custom Search Engine ID

    Returns:
        List of processed clips with B-roll suggestions

    Raises:
        Exception: any unexpected error is logged and re-raised
    """
    try:
        print("Starting B-roll generation process")
        print(f"Transcript data type: {type(transcript_data)}, length: {len(transcript_data) if transcript_data else 0}")
        print(f"Analysis text length: {len(analysis_text) if analysis_text else 0}")
        # Initialize OpenAI client (credentials come from the environment)
        client = OpenAI()
        # Extract clips from analysis text
        social_clips = extract_clips_from_analysis(analysis_text)
        if not social_clips:
            print("No clips found in analysis text")
            return []
        processed_clips = []
        for i, clip in enumerate(social_clips, 1):
            print(f"Processing clip {i}/{len(social_clips)}: {clip.get('clip_title', 'Unknown')}")
            start_time = clip.get("start_time", 0)
            end_time = clip.get("end_time", 0)
            # Extract relevant transcript content
            transcript_content = extract_transcript_content(
                transcript_data, start_time, end_time
            )
            if not transcript_content:
                # Keep the clip in the output so callers see every clip, but
                # flag the missing transcript explicitly instead of dropping it.
                print(f"No transcript content found for clip {start_time}-{end_time}")
                processed_clips.append(
                    {
                        **clip,
                        "broll_suggestions": [],
                        "error": "No transcript content found",
                    }
                )
                continue
            # Generate B-roll queries
            broll_queries = generate_broll_queries(client, transcript_content, clip)
            broll_suggestions = []
            for j, query_data in enumerate(broll_queries, 1):
                print(f"Processing query {j}/{len(broll_queries)}: {query_data.get('query', 'Unknown')}")
                query = query_data.get("query", "")
                timestamp = query_data.get("timestamp_in_clip", 0)
                reason = query_data.get("relevance_reason", "")
                if not query:
                    continue  # skip malformed entries with no usable query
                # Search Google Images only when both credentials are present
                images = []
                if google_api_key and search_engine_id:
                    print(f"Searching Google Images for: {query}")
                    images = search_google_images(
                        query, google_api_key, search_engine_id
                    )
                    print(f"Found {len(images)} images")
                else:
                    print("Skipping Google Images search (no API credentials)")
                broll_suggestions.append(
                    {
                        "query": query,
                        "timestamp_in_clip": timestamp,
                        # Position relative to the full video, not just the clip
                        "absolute_timestamp": start_time + timestamp,
                        "relevance_reason": reason,
                        "images": images,
                    }
                )
            processed_clips.append(
                {
                    **clip,
                    "transcript_content": transcript_content,
                    "broll_suggestions": broll_suggestions,
                }
            )
            print(f"Completed processing clip {i}, found {len(broll_suggestions)} suggestions")
        print(f"B-roll generation complete. Processed {len(processed_clips)} clips")
        return processed_clips
    except Exception as e:
        print(f"Error in process_broll_generation: {str(e)}")
        # Bare raise preserves the original traceback; `raise e` would add
        # this frame as the apparent raise site.
        raise
def format_broll_output(processed_clips: List[Dict]) -> str:
    """
    Format B-roll suggestions for display in the chat interface.

    Args:
        processed_clips: List of processed clips with B-roll suggestions

    Returns:
        Formatted string for display
    """
    if not processed_clips:
        return "No B-roll suggestions generated."

    def _mmss(total_seconds) -> str:
        # Render a second count as zero-padded MM:SS.
        minutes, seconds = divmod(int(total_seconds), 60)
        return f"{minutes:02d}:{seconds:02d}"

    lines = ["🎬 B-Roll Suggestions\n"]
    for clip_no, clip in enumerate(processed_clips, 1):
        lines.append(f"\n{clip_no}. {clip.get('clip_title', 'Unknown Clip')}")
        lines.append(
            f"Time: {_mmss(clip.get('start_time', 0))} - {_mmss(clip.get('end_time', 0))}"
        )
        suggestions = clip.get("broll_suggestions", [])
        if not suggestions:
            lines.append("No B-roll suggestions available for this clip.")
        else:
            for s_no, suggestion in enumerate(suggestions, 1):
                lines.append(f"  Query {s_no}: {suggestion.get('query', '')}")
                lines.append(f"    At: {_mmss(suggestion.get('timestamp_in_clip', 0))}")
                images = suggestion.get("images", [])
                if images:
                    # Show at most the top 2 image links per query.
                    for link_no, img in enumerate(images[:2], 1):
                        img_url = img.get("image_url", "")
                        if img_url:
                            lines.append(
                                f"      Link {link_no}: {img.get('title', 'Image')[:50]} - {img_url}"
                            )
                else:
                    lines.append("      No images found for this query.")
        lines.append("")
    return "\n".join(lines)