AhmadMustafa committed
Commit f098be9 · 1 Parent(s): 521213a
Files changed (3):
  1. app.py +96 -1
  2. broll_generator.py +391 -0
  3. utils.py +13 -0
app.py CHANGED
@@ -1,9 +1,12 @@
  import json
+ import os
  from typing import Generator, List

  import gradio as gr
- from crop_utils import get_image_crop
  from openai import OpenAI
+
+ from broll_generator import format_broll_output, process_broll_generation
+ from crop_utils import get_image_crop
  from prompts import (
      get_chat_system_prompt,
      get_live_event_system_prompt,
@@ -319,6 +322,98 @@ def chat(
          ):
              yield content
              return
+
+         elif tool_call.function.name == "generate_broll_suggestions":
+             # Generate B-roll suggestions based on the initial analysis
+             print("DOING B-ROLL GENERATION")
+             assistant_message = response.choices[0].message
+             messages.append(
+                 {
+                     "role": assistant_message.role,
+                     "content": assistant_message.content or "",
+                     "tool_calls": (
+                         [
+                             {
+                                 "id": tc.id,
+                                 "type": tc.type,
+                                 "function": {
+                                     "name": tc.function.name,
+                                     "arguments": tc.function.arguments,
+                                 },
+                             }
+                             for tc in assistant_message.tool_calls
+                         ]
+                         if assistant_message.tool_calls
+                         else None
+                     ),
+                 }
+             )
+
+             # Get the initial analysis first (if not already done)
+             analysis_messages = []
+             # print(messages)
+             for msg in messages:
+                 if msg["role"] == "assistant" and len(msg["content"]) > 100:
+                     analysis_messages.append(msg["content"])
+
+             if analysis_messages:
+                 # Use the most recent analysis text
+                 analysis_text = analysis_messages[-1]
+
+                 # Get transcript data
+                 transcript_data = transcript_processor.segments
+
+                 # Get Google API credentials from environment
+                 google_api_key = os.getenv("GOOGLE_API_KEY")
+                 search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
+
+                 try:
+                     # Process B-roll generation
+                     processed_clips = process_broll_generation(
+                         transcript_data,
+                         analysis_text,
+                         google_api_key,
+                         search_engine_id,
+                     )
+
+                     # Format the output
+                     broll_output = format_broll_output(processed_clips)
+
+                     function_call_result_message = {
+                         "role": "tool",
+                         "content": f"Generated B-roll suggestions for {len(processed_clips)} clips",
+                         "name": tool_call.function.name,
+                         "tool_call_id": tool_call.id,
+                     }
+                     messages.append(function_call_result_message)
+
+                     yield broll_output
+                     return
+
+                 except Exception as e:
+                     error_msg = (
+                         f"Error generating B-roll suggestions: {str(e)}"
+                     )
+                     function_call_result_message = {
+                         "role": "tool",
+                         "content": error_msg,
+                         "name": tool_call.function.name,
+                         "tool_call_id": tool_call.id,
+                     }
+                     messages.append(function_call_result_message)
+                     yield error_msg
+                     return
+             else:
+                 error_msg = "No analysis found. Please run the initial analysis first before generating B-roll suggestions."
+                 function_call_result_message = {
+                     "role": "tool",
+                     "content": error_msg,
+                     "name": tool_call.function.name,
+                     "tool_call_id": tool_call.id,
+                 }
+                 messages.append(function_call_result_message)
+                 yield error_msg
+                 return
          break  # Exit streaming loop if tool calls detected

      if not tool_calls_detected and chunk.choices[0].delta.content is not None:
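Note: the bookkeeping added above follows the standard OpenAI tool-calling shape: the assistant turn carrying tool_calls is appended to messages before the role "tool" result, and the result must reference the same tool_call_id. A minimal sketch of that protocol (the prompt and the follow-up completion are illustrative only; this app instead yields broll_output directly after recording the tool result):

from openai import OpenAI

from utils import openai_tools

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

history = [{"role": "user", "content": "Suggest B-roll for my clips"}]
first = client.chat.completions.create(
    model="gpt-4o", messages=history, tools=openai_tools
)
tool_call = first.choices[0].message.tool_calls[0]

# Echo the assistant turn, then answer the call by id before continuing
history.append(first.choices[0].message)
history.append(
    {
        "role": "tool",
        "tool_call_id": tool_call.id,
        "content": "Generated B-roll suggestions for 3 clips",
    }
)
followup = client.chat.completions.create(model="gpt-4o", messages=history)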
broll_generator.py ADDED
@@ -0,0 +1,391 @@
+ import json
+ import re
+ from typing import Dict, List, Optional
+
+ import requests
+ from openai import OpenAI
+
+
+ def extract_clips_from_analysis(analysis_text: str) -> List[Dict]:
+     """
+     Extract social media clips from the initial analysis output
+
+     Args:
+         analysis_text: The formatted analysis text from get_initial_analysis
+
+     Returns:
+         List of clip dictionaries with title, start_time, and end_time
+     """
+     print(f"Starting extract_clips_from_analysis with analysis_text length: {len(analysis_text)}")
+     clips = []
+
+     # Pattern to match clip links with timestamps
+     # Example: [Introduction and Event Overview <div id='topic' style="display: inline"> 40s at 03:25 </div>]
+     pattern = r"\[([^<]+)<div[^>]*>\s*(\d+)s\s+at\s+(\d{2}):(\d{2})\s*</div>\]"
+
+     matches = re.findall(pattern, analysis_text)
+     print(f"Found {len(matches)} matches in analysis text")
+
+     for match in matches:
+         title = match[0].strip()
+         duration = int(match[1])
+         minutes = int(match[2])
+         seconds = int(match[3])
+
+         start_time = minutes * 60 + seconds
+         end_time = start_time + duration
+
+         clip = {
+             "clip_title": title,
+             "start_time": start_time,
+             "end_time": end_time,
+             "duration": duration,
+         }
+         clips.append(clip)
+         print(f"Extracted clip: {title} ({start_time}-{end_time}s)")
+
+     print(f"Total clips extracted: {len(clips)}")
+     return clips
+
+
+ def extract_transcript_content(
+     transcript_data: List, start_time: float, end_time: float
+ ) -> str:
+     """
+     Extract transcript content between start and end times
+
+     Args:
+         transcript_data: List of transcript segments (TranscriptSegment objects or dicts)
+         start_time: Start time in seconds
+         end_time: End time in seconds
+
+     Returns:
+         Extracted transcript text
+     """
+     print(f"Extracting transcript content for {start_time}-{end_time}s from {len(transcript_data)} segments")
+     content = []
+
+     for segment in transcript_data:
+         # Handle both TranscriptSegment objects and dictionary formats
+         if hasattr(segment, "start_time") and hasattr(segment, "end_time"):
+             # TranscriptSegment object
+             segment_start = segment.start_time
+             segment_end = segment.end_time
+             segment_text = segment.text
+         elif hasattr(segment, "get"):
+             # Dictionary format
+             segment_start = segment.get("start_time", segment.get("start", 0))
+             segment_end = segment.get("end_time", segment.get("end", 0))
+             segment_text = segment.get("text", "")
+         else:
+             # Handle other object types with direct attribute access
+             segment_start = getattr(segment, "start_time", getattr(segment, "start", 0))
+             segment_end = getattr(segment, "end_time", getattr(segment, "end", 0))
+             segment_text = getattr(segment, "text", "")
+
+         # Check if segment overlaps with our time range
+         if segment_start <= end_time and segment_end >= start_time:
+             content.append(segment_text)
+
+     result = " ".join(content).strip()
+     print(f"Extracted {len(content)} segments, total text length: {len(result)}")
+     return result
+
+
+ def generate_broll_queries(
+     client: OpenAI, transcript_content: str, clip_data: Dict
+ ) -> List[Dict]:
+     """
+     Generate B-roll search queries using OpenAI based on transcript content and clip data
+
+     Args:
+         client: OpenAI client
+         transcript_content: Transcript text for the clip timeframe
+         clip_data: Social media clip data with timestamps
+
+     Returns:
+         List of query dictionaries with timestamps
+     """
+     duration = clip_data.get("end_time", 0) - clip_data.get("start_time", 0)
+     print(f"Generating B-roll queries for clip: {clip_data.get('clip_title', 'Unknown')}")
+
+     prompt = f"""
+     Analyze this transcript content from a social media clip and generate appropriate B-roll search queries.
+
+     Clip Title: {clip_data.get('clip_title', 'Unknown')}
+     Start Time: {clip_data.get('start_time', 0)} seconds
+     End Time: {clip_data.get('end_time', 0)} seconds
+     Duration: {duration} seconds
+
+     Transcript Content:
+     {transcript_content}
+
+     Generate 3-5 specific search queries that would find relevant B-roll images for this content.
+     For each query, specify the exact timestamp within the clip where it would be most relevant.
+
+     Focus on:
+     - Key people, places, or concepts mentioned
+     - Visual metaphors or illustrations
+     - Current events or topics discussed
+     - Products, companies, or brands mentioned
+
+     Return a JSON array with this structure:
+     [
+         {{
+             "query": "specific search query for Google Images",
+             "timestamp_in_clip": 5.2,
+             "relevance_reason": "why this image is relevant at this moment"
+         }}
+     ]
+
+     Ensure timestamps are between 0 and {duration} seconds.
+     Make queries specific and descriptive for better image search results.
+     """
+
+     try:
+         response = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are an expert video editor specializing in finding relevant B-roll content for social media clips. Generate specific, searchable queries that will find compelling visual content.",
+                 },
+                 {"role": "user", "content": prompt},
+             ],
+             temperature=0.3,
+         )
+
+         response_text = response.choices[0].message.content
+
+         # Extract JSON from the response (the model may wrap it in a ```json fence)
+         if "```json" in response_text and "```" in response_text.split("```json", 1)[1]:
+             json_text = response_text.split("```json", 1)[1].split("```", 1)[0]
+             queries = json.loads(json_text)
+         else:
+             queries = json.loads(response_text)
+
+         print(f"Generated {len(queries)} B-roll queries")
+         return queries
+
+     except Exception as e:
+         print(f"Error generating B-roll queries: {str(e)}")
+         return []
+
+
+ def search_google_images(
+     query: str, api_key: str, search_engine_id: str, num_results: int = 3
+ ) -> List[Dict]:
+     """
+     Search Google Images using Custom Search API
+
+     Args:
+         query: Search query string
+         api_key: Google API key
+         search_engine_id: Google Custom Search Engine ID
+         num_results: Number of results to return
+
+     Returns:
+         List of image result dictionaries
+     """
+     try:
+         url = "https://www.googleapis.com/customsearch/v1"
+         params = {
+             "key": api_key,
+             "cx": search_engine_id,
+             "q": query,
+             "searchType": "image",
+             "num": num_results,
+             "safe": "active",
+             "imgSize": "large",
+             "imgType": "photo",
+         }
+
+         # Timeout keeps a slow API response from hanging the worker
+         response = requests.get(url, params=params, timeout=10)
+         response.raise_for_status()
+
+         data = response.json()
+         results = []
+
+         for item in data.get("items", []):
+             result = {
+                 "title": item.get("title", ""),
+                 "image_url": item.get("link", ""),
+                 "thumbnail_url": item.get("image", {}).get("thumbnailLink", ""),
+                 "context_url": item.get("image", {}).get("contextLink", ""),
+                 "width": item.get("image", {}).get("width", 0),
+                 "height": item.get("image", {}).get("height", 0),
+                 "file_size": item.get("image", {}).get("byteSize", 0),
+             }
+             results.append(result)
+
+         return results
+
+     except Exception as e:
+         print(f"Error searching Google Images for query '{query}': {str(e)}")
+         return []
+
+
+ def process_broll_generation(
+     transcript_data: List,
+     analysis_text: str,
+     google_api_key: Optional[str] = None,
+     search_engine_id: Optional[str] = None,
+ ) -> List[Dict]:
+     """
+     Main processing function to generate B-roll content for social media clips
+
+     Args:
+         transcript_data: Full transcript data from TranscriptProcessor (list of TranscriptSegment objects or dicts)
+         analysis_text: The formatted analysis output from get_initial_analysis
+         google_api_key: Google API key for image search
+         search_engine_id: Google Custom Search Engine ID
+
+     Returns:
+         List of processed clips with B-roll suggestions
+     """
+     try:
+         print("Starting B-roll generation process")
+         print(f"Transcript data type: {type(transcript_data)}, length: {len(transcript_data) if transcript_data else 0}")
+         print(f"Analysis text length: {len(analysis_text) if analysis_text else 0}")
+
+         # Initialize OpenAI client
+         client = OpenAI()
+
+         # Extract clips from analysis text
+         social_clips = extract_clips_from_analysis(analysis_text)
+
+         if not social_clips:
+             print("No clips found in analysis text")
+             return []
+
+         processed_clips = []
+
+         for i, clip in enumerate(social_clips, 1):
+             print(f"Processing clip {i}/{len(social_clips)}: {clip.get('clip_title', 'Unknown')}")
+             start_time = clip.get("start_time", 0)
+             end_time = clip.get("end_time", 0)
+
+             # Extract relevant transcript content
+             transcript_content = extract_transcript_content(
+                 transcript_data, start_time, end_time
+             )
+
+             if not transcript_content:
+                 print(f"No transcript content found for clip {start_time}-{end_time}")
+                 processed_clips.append(
+                     {
+                         **clip,
+                         "broll_suggestions": [],
+                         "error": "No transcript content found",
+                     }
+                 )
+                 continue
+
+             # Generate B-roll queries
+             broll_queries = generate_broll_queries(client, transcript_content, clip)
+
+             broll_suggestions = []
+
+             for j, query_data in enumerate(broll_queries, 1):
+                 print(f"Processing query {j}/{len(broll_queries)}: {query_data.get('query', 'Unknown')}")
+                 query = query_data.get("query", "")
+                 timestamp = query_data.get("timestamp_in_clip", 0)
+                 reason = query_data.get("relevance_reason", "")
+
+                 if not query:
+                     continue
+
+                 # Search Google Images only if API credentials are available
+                 images = []
+                 if google_api_key and search_engine_id:
+                     print(f"Searching Google Images for: {query}")
+                     images = search_google_images(
+                         query, google_api_key, search_engine_id
+                     )
+                     print(f"Found {len(images)} images")
+                 else:
+                     print("Skipping Google Images search (no API credentials)")
+
+                 broll_suggestion = {
+                     "query": query,
+                     "timestamp_in_clip": timestamp,
+                     "absolute_timestamp": start_time + timestamp,
+                     "relevance_reason": reason,
+                     "images": images,
+                 }
+                 broll_suggestions.append(broll_suggestion)
+
+             processed_clip = {
+                 **clip,
+                 "transcript_content": transcript_content,
+                 "broll_suggestions": broll_suggestions,
+             }
+             processed_clips.append(processed_clip)
+             print(f"Completed processing clip {i}, found {len(broll_suggestions)} suggestions")
+
+         print(f"B-roll generation complete. Processed {len(processed_clips)} clips")
+         return processed_clips
+
+     except Exception as e:
+         print(f"Error in process_broll_generation: {str(e)}")
+         raise
+
+
+ def format_broll_output(processed_clips: List[Dict]) -> str:
+     """
+     Format B-roll suggestions for display in the chat interface
+
+     Args:
+         processed_clips: List of processed clips with B-roll suggestions
+
+     Returns:
+         Formatted string for display
+     """
+     if not processed_clips:
+         return "No B-roll suggestions generated."
+
+     output = ["🎬 B-Roll Suggestions\n"]
+
+     for i, clip in enumerate(processed_clips, 1):
+         title = clip.get("clip_title", "Unknown Clip")
+         start_time = clip.get("start_time", 0)
+         end_time = clip.get("end_time", 0)
+
+         # Format time display
+         start_min, start_sec = divmod(int(start_time), 60)
+         end_min, end_sec = divmod(int(end_time), 60)
+
+         output.append(f"\n{i}. {title}")
+         output.append(f"Time: {start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}")
+
+         broll_suggestions = clip.get("broll_suggestions", [])
+
+         if not broll_suggestions:
+             output.append("No B-roll suggestions available for this clip.")
+         else:
+             for j, suggestion in enumerate(broll_suggestions, 1):
+                 query = suggestion.get("query", "")
+                 timestamp = suggestion.get("timestamp_in_clip", 0)
+                 images = suggestion.get("images", [])
+
+                 # Format timestamp within clip
+                 ts_min, ts_sec = divmod(int(timestamp), 60)
+
+                 output.append(f"  Query {j}: {query}")
+                 output.append(f"  At: {ts_min:02d}:{ts_sec:02d}")
+
+                 # Show top 2 image links only
+                 if images:
+                     top_images = images[:2]
+                     for k, img in enumerate(top_images, 1):
+                         img_url = img.get("image_url", "")
+                         img_title = img.get("title", "Image")
+                         if img_url:
+                             output.append(f"    Link {k}: {img_title[:50]} - {img_url}")
+                 else:
+                     output.append("    No images found for this query.")
+
+         output.append("")
+
+     return "\n".join(output)
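Taken end to end, the new module is exercised like this (a minimal sketch: the analysis string and segments below are invented stand-ins for what app.py actually passes in, and OPENAI_API_KEY must be set since process_broll_generation constructs its own OpenAI client):

from broll_generator import format_broll_output, process_broll_generation

# Invented analysis line, in the exact format extract_clips_from_analysis parses:
# title, then a <div> with "<duration>s at MM:SS"
analysis = (
    "[Introduction and Event Overview "
    "<div id='topic' style=\"display: inline\"> 40s at 03:25 </div>]"
)

# Dict-shaped segments; TranscriptSegment objects with the same attributes also work
segments = [
    {"start_time": 205.0, "end_time": 220.0, "text": "Welcome, everyone, to the event."},
    {"start_time": 220.0, "end_time": 245.0, "text": "Today we cover three announcements."},
]

# Without Google credentials the image search is skipped; only queries come back
clips = process_broll_generation(segments, analysis)
print(format_broll_output(clips))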
utils.py CHANGED
@@ -97,6 +97,19 @@ openai_tools = [
              },
          },
      },
+     {
+         "type": "function",
+         "function": {
+             "name": "generate_broll_suggestions",
+             "description": "Generate B-roll image suggestions for social media clips. Call this function when the user asks for B-roll images, video suggestions, or visual content for the clips.",
+             "parameters": {
+                 "type": "object",
+                 "properties": {},
+                 "required": [],
+                 "additionalProperties": False,
+             },
+         },
+     },
  ]

  css = """
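For context, a parameterless tool like this is advertised to the model by passing openai_tools on the chat call; an invocation then shows up as tool_calls on the response, with empty arguments. A minimal sketch (the prompt is invented):

from openai import OpenAI

from utils import openai_tools

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Find B-roll images for these clips"}],
    tools=openai_tools,
)

# With an empty "properties" schema, arguments is just "{}"
for tool_call in response.choices[0].message.tool_calls or []:
    print(tool_call.function.name, tool_call.function.arguments)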