AhmadMustafa committed
Commit f098be9 · 1 Parent(s): 521213a
Files changed (3):
  1. app.py +96 -1
  2. broll_generator.py +391 -0
  3. utils.py +13 -0
app.py CHANGED
@@ -1,9 +1,12 @@
  import json
+ import os
  from typing import Generator, List

  import gradio as gr
- from crop_utils import get_image_crop
  from openai import OpenAI
+
+ from broll_generator import format_broll_output, process_broll_generation
+ from crop_utils import get_image_crop
  from prompts import (
      get_chat_system_prompt,
      get_live_event_system_prompt,
@@ -319,6 +322,98 @@ def chat(
          ):
              yield content
              return
+
+         elif tool_call.function.name == "generate_broll_suggestions":
+             # Generate B-roll suggestions based on the initial analysis
+             print("DOING B-ROLL GENERATION")
+             assistant_message = response.choices[0].message
+             messages.append(
+                 {
+                     "role": assistant_message.role,
+                     "content": assistant_message.content or "",
+                     "tool_calls": (
+                         [
+                             {
+                                 "id": tc.id,
+                                 "type": tc.type,
+                                 "function": {
+                                     "name": tc.function.name,
+                                     "arguments": tc.function.arguments,
+                                 },
+                             }
+                             for tc in assistant_message.tool_calls
+                         ]
+                         if assistant_message.tool_calls
+                         else None
+                     ),
+                 }
+             )
+
+             # Get the initial analysis first (if not already done)
+             analysis_messages = []
+             # print(messages)
+             for msg in messages:
+                 if msg["role"] == "assistant" and len(msg["content"]) > 100:
+                     analysis_messages.append(msg["content"])
+
+             if analysis_messages:
+                 # Use the most recent analysis text
+                 analysis_text = analysis_messages[-1]
+
+                 # Get transcript data
+                 transcript_data = transcript_processor.segments
+
+                 # Get Google API credentials from environment
+                 google_api_key = os.getenv("GOOGLE_API_KEY")
+                 search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
+
+                 try:
+                     # Process B-roll generation
+                     processed_clips = process_broll_generation(
+                         transcript_data,
+                         analysis_text,
+                         google_api_key,
+                         search_engine_id,
+                     )
+
+                     # Format the output
+                     broll_output = format_broll_output(processed_clips)
+
+                     function_call_result_message = {
+                         "role": "tool",
+                         "content": f"Generated B-roll suggestions for {len(processed_clips)} clips",
+                         "name": tool_call.function.name,
+                         "tool_call_id": tool_call.id,
+                     }
+                     messages.append(function_call_result_message)
+
+                     yield broll_output
+                     return
+
+                 except Exception as e:
+                     error_msg = (
+                         f"Error generating B-roll suggestions: {str(e)}"
+                     )
+                     function_call_result_message = {
+                         "role": "tool",
+                         "content": error_msg,
+                         "name": tool_call.function.name,
+                         "tool_call_id": tool_call.id,
+                     }
+                     messages.append(function_call_result_message)
+                     yield error_msg
+                     return
+             else:
+                 error_msg = "No analysis found. Please run the initial analysis first before generating B-roll suggestions."
+                 function_call_result_message = {
+                     "role": "tool",
+                     "content": error_msg,
+                     "name": tool_call.function.name,
+                     "tool_call_id": tool_call.id,
+                 }
+                 messages.append(function_call_result_message)
+                 yield error_msg
+                 return
          break  # Exit streaming loop if tool calls detected

      if not tool_calls_detected and chunk.choices[0].delta.content is not None:
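Note: the bookkeeping added above follows the standard OpenAI tool-calling shape: the assistant turn carrying tool_calls is appended to messages before the role "tool" result, and the result must reference the same tool_call_id. A minimal sketch of that protocol (the prompt and the follow-up completion are illustrative only; this app instead yields broll_output directly after recording the tool result):

from openai import OpenAI

from utils import openai_tools

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

history = [{"role": "user", "content": "Suggest B-roll for my clips"}]
first = client.chat.completions.create(
    model="gpt-4o", messages=history, tools=openai_tools
)
tool_call = first.choices[0].message.tool_calls[0]

# Echo the assistant turn, then answer the call by id before continuing
history.append(first.choices[0].message)
history.append(
    {
        "role": "tool",
        "tool_call_id": tool_call.id,
        "content": "Generated B-roll suggestions for 3 clips",
    }
)
followup = client.chat.completions.create(model="gpt-4o", messages=history)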
broll_generator.py ADDED
@@ -0,0 +1,391 @@
+ import json
+ import re
+ from typing import Dict, List, Optional
+
+ import requests
+ from openai import OpenAI
+
+
+ def extract_clips_from_analysis(analysis_text: str) -> List[Dict]:
+     """
+     Extract social media clips from the initial analysis output
+
+     Args:
+         analysis_text: The formatted analysis text from get_initial_analysis
+
+     Returns:
+         List of clip dictionaries with title, start_time, and end_time
+     """
+     print(f"Starting extract_clips_from_analysis with analysis_text length: {len(analysis_text)}")
+     clips = []
+
+     # Pattern to match clip links with timestamps
+     # Example: [Introduction and Event Overview <div id='topic' style="display: inline"> 40s at 03:25 </div>]
+     pattern = r"\[([^<]+)<div[^>]*>\s*(\d+)s\s+at\s+(\d{2}):(\d{2})\s*</div>\]"
+
+     matches = re.findall(pattern, analysis_text)
+     print(f"Found {len(matches)} matches in analysis text")
+
+     for match in matches:
+         title = match[0].strip()
+         duration = int(match[1])
+         minutes = int(match[2])
+         seconds = int(match[3])
+
+         start_time = minutes * 60 + seconds
+         end_time = start_time + duration
+
+         clip = {
+             "clip_title": title,
+             "start_time": start_time,
+             "end_time": end_time,
+             "duration": duration,
+         }
+         clips.append(clip)
+         print(f"Extracted clip: {title} ({start_time}-{end_time}s)")
+
+     print(f"Total clips extracted: {len(clips)}")
+     return clips
+
+
+ def extract_transcript_content(
+     transcript_data: List, start_time: float, end_time: float
+ ) -> str:
+     """
+     Extract transcript content between start and end times
+
+     Args:
+         transcript_data: List of transcript segments (TranscriptSegment objects or dicts)
+         start_time: Start time in seconds
+         end_time: End time in seconds
+
+     Returns:
+         Extracted transcript text
+     """
+     print(f"Extracting transcript content for {start_time}-{end_time}s from {len(transcript_data)} segments")
+     content = []
+
+     for segment in transcript_data:
+         # Handle both TranscriptSegment objects and dictionary formats
+         if hasattr(segment, "start_time") and hasattr(segment, "end_time"):
+             # TranscriptSegment object
+             segment_start = segment.start_time
+             segment_end = segment.end_time
+             segment_text = segment.text
+         elif hasattr(segment, "get"):
+             # Dictionary format
+             segment_start = segment.get("start_time", segment.get("start", 0))
+             segment_end = segment.get("end_time", segment.get("end", 0))
+             segment_text = segment.get("text", "")
+         else:
+             # Handle other object types with direct attribute access
+             segment_start = getattr(segment, "start_time", getattr(segment, "start", 0))
+             segment_end = getattr(segment, "end_time", getattr(segment, "end", 0))
+             segment_text = getattr(segment, "text", "")
+
+         # Check if segment overlaps with our time range
+         if segment_start <= end_time and segment_end >= start_time:
+             content.append(segment_text)
+
+     result = " ".join(content).strip()
+     print(f"Extracted {len(content)} segments, total text length: {len(result)}")
+     return result
+
+
+ def generate_broll_queries(
+     client: OpenAI, transcript_content: str, clip_data: Dict
+ ) -> List[Dict]:
+     """
+     Generate B-roll search queries using OpenAI based on transcript content and clip data
+
+     Args:
+         client: OpenAI client
+         transcript_content: Transcript text for the clip timeframe
+         clip_data: Social media clip data with timestamps
+
+     Returns:
+         List of query dictionaries with timestamps
+     """
+     duration = clip_data.get("end_time", 0) - clip_data.get("start_time", 0)
+     print(f"Generating B-roll queries for clip: {clip_data.get('clip_title', 'Unknown')}")
+
+     prompt = f"""
+     Analyze this transcript content from a social media clip and generate appropriate B-roll search queries.
+
+     Clip Title: {clip_data.get('clip_title', 'Unknown')}
+     Start Time: {clip_data.get('start_time', 0)} seconds
+     End Time: {clip_data.get('end_time', 0)} seconds
+     Duration: {duration} seconds
+
+     Transcript Content:
+     {transcript_content}
+
+     Generate 3-5 specific search queries that would find relevant B-roll images for this content.
+     For each query, specify the exact timestamp within the clip where it would be most relevant.
+
+     Focus on:
+     - Key people, places, or concepts mentioned
+     - Visual metaphors or illustrations
+     - Current events or topics discussed
+     - Products, companies, or brands mentioned
+
+     Return a JSON array with this structure:
+     [
+         {{
+             "query": "specific search query for Google Images",
+             "timestamp_in_clip": 5.2,
+             "relevance_reason": "why this image is relevant at this moment"
+         }}
+     ]
+
+     Ensure timestamps are between 0 and {duration} seconds.
+     Make queries specific and descriptive for better image search results.
+     """
+
+     try:
+         response = client.chat.completions.create(
+             model="gpt-4o",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are an expert video editor specializing in finding relevant B-roll content for social media clips. Generate specific, searchable queries that will find compelling visual content.",
+                 },
+                 {"role": "user", "content": prompt},
+             ],
+             temperature=0.3,
+         )
+
+         response_text = response.choices[0].message.content
+
+         # Extract JSON from the response (the model may wrap it in a ```json fence)
+         if "```json" in response_text and "```" in response_text.split("```json", 1)[1]:
+             json_text = response_text.split("```json", 1)[1].split("```", 1)[0]
+             queries = json.loads(json_text)
+         else:
+             queries = json.loads(response_text)
+
+         print(f"Generated {len(queries)} B-roll queries")
+         return queries
+
+     except Exception as e:
+         print(f"Error generating B-roll queries: {str(e)}")
+         return []
+
+
+ def search_google_images(
+     query: str, api_key: str, search_engine_id: str, num_results: int = 3
+ ) -> List[Dict]:
+     """
+     Search Google Images using Custom Search API
+
+     Args:
+         query: Search query string
+         api_key: Google API key
+         search_engine_id: Google Custom Search Engine ID
+         num_results: Number of results to return
+
+     Returns:
+         List of image result dictionaries
+     """
+     try:
+         url = "https://www.googleapis.com/customsearch/v1"
+         params = {
+             "key": api_key,
+             "cx": search_engine_id,
+             "q": query,
+             "searchType": "image",
+             "num": num_results,
+             "safe": "active",
+             "imgSize": "large",
+             "imgType": "photo",
+         }
+
+         # Timeout keeps a slow API response from hanging the worker
+         response = requests.get(url, params=params, timeout=10)
+         response.raise_for_status()
+
+         data = response.json()
+         results = []
+
+         for item in data.get("items", []):
+             result = {
+                 "title": item.get("title", ""),
+                 "image_url": item.get("link", ""),
+                 "thumbnail_url": item.get("image", {}).get("thumbnailLink", ""),
+                 "context_url": item.get("image", {}).get("contextLink", ""),
+                 "width": item.get("image", {}).get("width", 0),
+                 "height": item.get("image", {}).get("height", 0),
+                 "file_size": item.get("image", {}).get("byteSize", 0),
+             }
+             results.append(result)
+
+         return results
+
+     except Exception as e:
+         print(f"Error searching Google Images for query '{query}': {str(e)}")
+         return []
+
+
+ def process_broll_generation(
+     transcript_data: List,
+     analysis_text: str,
+     google_api_key: Optional[str] = None,
+     search_engine_id: Optional[str] = None,
+ ) -> List[Dict]:
+     """
+     Main processing function to generate B-roll content for social media clips
+
+     Args:
+         transcript_data: Full transcript data from TranscriptProcessor (list of TranscriptSegment objects or dicts)
+         analysis_text: The formatted analysis output from get_initial_analysis
+         google_api_key: Google API key for image search
+         search_engine_id: Google Custom Search Engine ID
+
+     Returns:
+         List of processed clips with B-roll suggestions
+     """
+     try:
+         print("Starting B-roll generation process")
+         print(f"Transcript data type: {type(transcript_data)}, length: {len(transcript_data) if transcript_data else 0}")
+         print(f"Analysis text length: {len(analysis_text) if analysis_text else 0}")
+
+         # Initialize OpenAI client
+         client = OpenAI()
+
+         # Extract clips from analysis text
+         social_clips = extract_clips_from_analysis(analysis_text)
+
+         if not social_clips:
+             print("No clips found in analysis text")
+             return []
+
+         processed_clips = []
+
+         for i, clip in enumerate(social_clips, 1):
+             print(f"Processing clip {i}/{len(social_clips)}: {clip.get('clip_title', 'Unknown')}")
+             start_time = clip.get("start_time", 0)
+             end_time = clip.get("end_time", 0)
+
+             # Extract relevant transcript content
+             transcript_content = extract_transcript_content(
+                 transcript_data, start_time, end_time
+             )
+
+             if not transcript_content:
+                 print(f"No transcript content found for clip {start_time}-{end_time}")
+                 processed_clips.append(
+                     {
+                         **clip,
+                         "broll_suggestions": [],
+                         "error": "No transcript content found",
+                     }
+                 )
+                 continue
+
+             # Generate B-roll queries
+             broll_queries = generate_broll_queries(client, transcript_content, clip)
+
+             broll_suggestions = []
+
+             for j, query_data in enumerate(broll_queries, 1):
+                 print(f"Processing query {j}/{len(broll_queries)}: {query_data.get('query', 'Unknown')}")
+                 query = query_data.get("query", "")
+                 timestamp = query_data.get("timestamp_in_clip", 0)
+                 reason = query_data.get("relevance_reason", "")
+
+                 if not query:
+                     continue
+
+                 # Search Google Images only if API credentials are available
+                 images = []
+                 if google_api_key and search_engine_id:
+                     print(f"Searching Google Images for: {query}")
+                     images = search_google_images(
+                         query, google_api_key, search_engine_id
+                     )
+                     print(f"Found {len(images)} images")
+                 else:
+                     print("Skipping Google Images search (no API credentials)")
+
+                 broll_suggestion = {
+                     "query": query,
+                     "timestamp_in_clip": timestamp,
+                     "absolute_timestamp": start_time + timestamp,
+                     "relevance_reason": reason,
+                     "images": images,
+                 }
+                 broll_suggestions.append(broll_suggestion)
+
+             processed_clip = {
+                 **clip,
+                 "transcript_content": transcript_content,
+                 "broll_suggestions": broll_suggestions,
+             }
+             processed_clips.append(processed_clip)
+             print(f"Completed processing clip {i}, found {len(broll_suggestions)} suggestions")
+
+         print(f"B-roll generation complete. Processed {len(processed_clips)} clips")
+         return processed_clips
+
+     except Exception as e:
+         print(f"Error in process_broll_generation: {str(e)}")
+         raise
+
+
+ def format_broll_output(processed_clips: List[Dict]) -> str:
+     """
+     Format B-roll suggestions for display in the chat interface
+
+     Args:
+         processed_clips: List of processed clips with B-roll suggestions
+
+     Returns:
+         Formatted string for display
+     """
+     if not processed_clips:
+         return "No B-roll suggestions generated."
+
+     output = ["🎬 B-Roll Suggestions\n"]
+
+     for i, clip in enumerate(processed_clips, 1):
+         title = clip.get("clip_title", "Unknown Clip")
+         start_time = clip.get("start_time", 0)
+         end_time = clip.get("end_time", 0)
+
+         # Format time display
+         start_min, start_sec = divmod(int(start_time), 60)
+         end_min, end_sec = divmod(int(end_time), 60)
+
+         output.append(f"\n{i}. {title}")
+         output.append(f"Time: {start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}")
+
+         broll_suggestions = clip.get("broll_suggestions", [])
+
+         if not broll_suggestions:
+             output.append("No B-roll suggestions available for this clip.")
+         else:
+             for j, suggestion in enumerate(broll_suggestions, 1):
+                 query = suggestion.get("query", "")
+                 timestamp = suggestion.get("timestamp_in_clip", 0)
+                 images = suggestion.get("images", [])
+
+                 # Format timestamp within clip
+                 ts_min, ts_sec = divmod(int(timestamp), 60)
+
+                 output.append(f"  Query {j}: {query}")
+                 output.append(f"  At: {ts_min:02d}:{ts_sec:02d}")
+
+                 # Show top 2 image links only
+                 if images:
+                     top_images = images[:2]
+                     for k, img in enumerate(top_images, 1):
+                         img_url = img.get("image_url", "")
+                         img_title = img.get("title", "Image")
+                         if img_url:
+                             output.append(f"    Link {k}: {img_title[:50]} - {img_url}")
+                 else:
+                     output.append("    No images found for this query.")
+
+         output.append("")
+
+     return "\n".join(output)
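Taken end to end, the new module is exercised like this (a minimal sketch: the analysis string and segments below are invented stand-ins for what app.py actually passes in, and OPENAI_API_KEY must be set since process_broll_generation constructs its own OpenAI client):

from broll_generator import format_broll_output, process_broll_generation

# Invented analysis line, in the exact format extract_clips_from_analysis parses:
# title, then a <div> with "<duration>s at MM:SS"
analysis = (
    "[Introduction and Event Overview "
    "<div id='topic' style=\"display: inline\"> 40s at 03:25 </div>]"
)

# Dict-shaped segments; TranscriptSegment objects with the same attributes also work
segments = [
    {"start_time": 205.0, "end_time": 220.0, "text": "Welcome, everyone, to the event."},
    {"start_time": 220.0, "end_time": 245.0, "text": "Today we cover three announcements."},
]

# Without Google credentials the image search is skipped; only queries come back
clips = process_broll_generation(segments, analysis)
print(format_broll_output(clips))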
utils.py CHANGED
@@ -97,6 +97,19 @@ openai_tools = [
              },
          },
      },
+     {
+         "type": "function",
+         "function": {
+             "name": "generate_broll_suggestions",
+             "description": "Generate B-roll image suggestions for social media clips. Call this function when the user asks for B-roll images, video suggestions, or visual content for the clips.",
+             "parameters": {
+                 "type": "object",
+                 "properties": {},
+                 "required": [],
+                 "additionalProperties": False,
+             },
+         },
+     },
  ]

  css = """
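For context, a parameterless tool like this is advertised to the model by passing openai_tools on the chat call; an invocation then shows up as tool_calls on the response, with empty arguments. A minimal sketch (the prompt is invented):

from openai import OpenAI

from utils import openai_tools

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Find B-roll images for these clips"}],
    tools=openai_tools,
)

# With an empty "properties" schema, arguments is just "{}"
for tool_call in response.choices[0].message.tool_calls or []:
    print(tool_call.function.name, tool_call.function.arguments)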