Spaces:

RollAI
/

ChatWithTranscript

Running

File size: 13,905 Bytes

f098be9

import json
import os
import re
from typing import Dict, List, Tuple, Union

import requests
from openai import OpenAI


def extract_clips_from_analysis(analysis_text: str) -> List[Dict]:
    """
    Extract social media clips from the initial analysis output

    Each clip is expected to appear as a bracketed link such as
    ``[Title <div ...> 40s at 03:25 </div>]`` where ``40s`` is the clip
    duration and ``03:25`` is the start timestamp. The timestamp may be
    ``MM:SS`` or ``H:MM:SS``.

    Args:
        analysis_text: The formatted analysis text from get_initial_analysis.
            ``None`` is treated the same as an empty string.

    Returns:
        List of clip dictionaries with clip_title, start_time, end_time and
        duration, all times expressed in seconds
    """
    # Callers may hand over None when no analysis was produced; treat it as
    # "no clips" instead of failing on len(None).
    text = analysis_text or ""
    print(f"Starting extract_clips_from_analysis with analysis_text length: {len(text)}")
    clips = []

    # Pattern to match clip links with timestamps
    # Example: [Introduction and Event Overview <div id='topic' style="display: inline"> 40s at 03:25 </div>]
    # Groups: title, duration (s), optional hours, minutes, seconds.
    pattern = r"\[([^<]+)<div[^>]*>\s*(\d+)s\s+at\s+(?:(\d+):)?(\d+):(\d{2})\s*</div>\]"

    matches = re.findall(pattern, text)
    print(f"Found {len(matches)} matches in analysis text")

    for raw_title, raw_duration, raw_hours, raw_minutes, raw_seconds in matches:
        title = raw_title.strip()
        duration = int(raw_duration)
        # The hours group is an empty string when the timestamp is MM:SS.
        hours = int(raw_hours) if raw_hours else 0

        start_time = hours * 3600 + int(raw_minutes) * 60 + int(raw_seconds)
        end_time = start_time + duration

        clips.append(
            {
                "clip_title": title,
                "start_time": start_time,
                "end_time": end_time,
                "duration": duration,
            }
        )
        print(f"Extracted clip: {title} ({start_time}-{end_time}s)")

    print(f"Total clips extracted: {len(clips)}")
    return clips


def extract_transcript_content(
    transcript_data: List, start_time: float, end_time: float
) -> str:
    """
    Collect the transcript text that overlaps a time window

    Args:
        transcript_data: List of transcript segments; each may be an object
            exposing start_time/end_time/text, a mapping with a ``get``
            method, or any other object (read via getattr with defaults)
        start_time: Window start in seconds
        end_time: Window end in seconds

    Returns:
        Text of every overlapping segment joined by single spaces and stripped
    """
    print(f"Extracting transcript content for {start_time}-{end_time}s from {len(transcript_data)} segments")

    def _span_and_text(item):
        # Normalise one segment to a (start, end, text) triple.
        if hasattr(item, "start_time") and hasattr(item, "end_time"):
            # Segment object carrying the canonical attribute names
            return item.start_time, item.end_time, item.text
        if hasattr(item, "get"):
            # Mapping: accept start_time/end_time or the shorter start/end keys
            return (
                item.get("start_time", item.get("start", 0)),
                item.get("end_time", item.get("end", 0)),
                item.get("text", ""),
            )
        # Anything else: attribute access with the same fallbacks
        return (
            getattr(item, "start_time", getattr(item, "start", 0)),
            getattr(item, "end_time", getattr(item, "end", 0)),
            getattr(item, "text", ""),
        )

    # Keep a segment when its span touches or overlaps the requested window.
    picked = [
        words
        for begins, ends, words in map(_span_and_text, transcript_data)
        if begins <= end_time and ends >= start_time
    ]

    joined = " ".join(picked).strip()
    print(f"Extracted {len(picked)} segments, total text length: {len(joined)}")
    return joined


def generate_broll_queries(
    client: OpenAI, transcript_content: str, clip_data: Dict
) -> List[Dict]:
    """
    Generate B-roll search queries using OpenAI based on transcript content and clip data

    The model is asked for a JSON array of query objects. The reply is
    accepted either as bare JSON or wrapped in a Markdown code fence (with or
    without a ``json`` tag). A reply that does not parse to a JSON array is
    treated as a failure, and array items that are not objects are dropped,
    so the caller always receives a list of dictionaries.

    Args:
        client: OpenAI client
        transcript_content: Transcript text for the clip timeframe
        clip_data: Social media clip data; clip_title, start_time and
            end_time are read, with defaults when missing

    Returns:
        List of query dictionaries as produced by the model (the prompt
        requests query, timestamp_in_clip and relevance_reason keys).
        An empty list is returned on any failure; the error is printed
        rather than raised.
    """
    duration = clip_data.get("end_time", 0) - clip_data.get("start_time", 0)
    print(f"Generating B-roll queries for clip: {clip_data.get('clip_title', 'Unknown')}")

    prompt = f"""
    Analyze this transcript content from a social media clip and generate appropriate B-roll search queries.

    Clip Title: {clip_data.get('clip_title', 'Unknown')}
    Start Time: {clip_data.get('start_time', 0)} seconds
    End Time: {clip_data.get('end_time', 0)} seconds
    Duration: {duration} seconds

    Transcript Content:
    {transcript_content}

    Generate 3-5 specific search queries that would find relevant B-roll images for this content.
    For each query, specify the exact timestamp within the clip where it would be most relevant.

    Focus on:
    - Key people, places, or concepts mentioned
    - Visual metaphors or illustrations
    - Current events or topics discussed
    - Products, companies, or brands mentioned

    Return a JSON array with this structure:
    [
        {{
            "query": "specific search query for Google Images",
            "timestamp_in_clip": 5.2,
            "relevance_reason": "why this image is relevant at this moment"
        }}
    ]

    Ensure timestamps are between 0 and {duration} seconds.
    Make queries specific and descriptive for better image search results.
    """

    # Best-effort by design: any API, parsing or validation problem results in
    # an empty list so one bad clip does not abort the surrounding pipeline.
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert video editor specializing in finding relevant B-roll content for social media clips. Generate specific, searchable queries that will find compelling visual content.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
        )

        # The message content can be missing; normalise to text so the
        # failure surfaces as a JSON error instead of a TypeError.
        response_text = response.choices[0].message.content or ""

        parsed = json.loads(_extract_json_payload(response_text))
        if not isinstance(parsed, list):
            raise ValueError(
                f"expected a JSON array of queries, got {type(parsed).__name__}"
            )

        # Callers use dict access on every entry, so drop anything else.
        queries = [entry for entry in parsed if isinstance(entry, dict)]

        print(f"Generated {len(queries)} B-roll queries")
        return queries

    except Exception as e:
        print(f"Error generating B-roll queries: {str(e)}")
        return []


def _extract_json_payload(response_text: str) -> str:
    """Return the contents of the first closed code fence, or the text itself."""
    # Prefer an explicitly tagged ```json fence, then fall back to a bare one.
    for opener in ("```json", "```"):
        if opener in response_text:
            remainder = response_text.split(opener, 1)[1]
            if "```" in remainder:
                return remainder.split("```", 1)[0]
    return response_text


def search_google_images(
    query: str,
    api_key: str,
    search_engine_id: str,
    num_results: int = 3,
    timeout: float = 10.0,
) -> List[Dict]:
    """
    Search Google Images using Custom Search API

    Args:
        query: Search query string
        api_key: Google API key
        search_engine_id: Google Custom Search Engine ID
        num_results: Number of results to return. Values above 10 are capped
            at 10 (the per-request maximum of the API); values below 1 yield
            an empty list without issuing a request.
        timeout: Seconds to wait for the HTTP request before giving up

    Returns:
        List of image result dictionaries with title, image_url,
        thumbnail_url, context_url, width, height and file_size keys.
        An empty list is returned on any failure; the error is printed
        rather than raised.
    """
    # Best-effort by design: network, HTTP and decoding errors all degrade to
    # "no images" so a failed search does not abort the caller.
    try:
        num = num_results
        if num is not None:
            if num < 1:
                return []
            # The API rejects num > 10, which would otherwise lose all results.
            num = min(num, 10)

        url = "https://www.googleapis.com/customsearch/v1"
        params = {
            "key": api_key,
            "cx": search_engine_id,
            "q": query,
            "searchType": "image",
            "num": num,
            "safe": "active",
            "imgSize": "large",
            "imgType": "photo",
        }

        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()

        data = response.json()
        results = []

        for item in data.get("items", []):
            # Tolerate a missing or null "image" object on an item.
            image_meta = item.get("image") or {}
            results.append(
                {
                    "title": item.get("title", ""),
                    "image_url": item.get("link", ""),
                    "thumbnail_url": image_meta.get("thumbnailLink", ""),
                    "context_url": image_meta.get("contextLink", ""),
                    "width": image_meta.get("width", 0),
                    "height": image_meta.get("height", 0),
                    "file_size": image_meta.get("byteSize", 0),
                }
            )

        return results

    except Exception as e:
        print(f"Error searching Google Images for query '{query}': {str(e)}")
        return []


def process_broll_generation(
    transcript_data: List,
    analysis_text: str,
    google_api_key: Union[str, None] = None,
    search_engine_id: Union[str, None] = None,
) -> List[Dict]:
    """
    Main processing function to generate B-roll content for social media clips

    For every clip found in the analysis text, the overlapping transcript
    text is extracted, search queries are generated from it, and (when both
    Google credentials are given) images are looked up for each query.

    Args:
        transcript_data: Full transcript data from TranscriptProcessor (list of
            TranscriptSegment objects or dicts). ``None`` is treated as empty.
        analysis_text: The formatted analysis output from get_initial_analysis
        google_api_key: Google API key for image search
        search_engine_id: Google Custom Search Engine ID

    Returns:
        List of processed clips with B-roll suggestions. Each entry is the
        clip dictionary extended with broll_suggestions and either
        transcript_content or, when no transcript text was found, an error
        message.

    Raises:
        Exception: Any unexpected error is printed and re-raised unchanged.
    """
    try:
        print("Starting B-roll generation process")
        print(f"Transcript data type: {type(transcript_data)}, length: {len(transcript_data) if transcript_data else 0}")
        print(f"Analysis text length: {len(analysis_text) if analysis_text else 0}")

        # Initialize OpenAI client
        client = OpenAI()

        # Extract clips from analysis text
        social_clips = extract_clips_from_analysis(analysis_text)

        if not social_clips:
            print("No clips found in analysis text")
            return []

        # A missing transcript is handled like an empty one, so every clip is
        # reported with "No transcript content found" instead of crashing.
        segments = transcript_data or []

        processed_clips = []

        for i, clip in enumerate(social_clips, 1):
            print(f"Processing clip {i}/{len(social_clips)}: {clip.get('clip_title', 'Unknown')}")
            start_time = clip.get("start_time", 0)
            end_time = clip.get("end_time", 0)

            # Extract relevant transcript content
            transcript_content = extract_transcript_content(
                segments, start_time, end_time
            )

            if not transcript_content:
                print(f"No transcript content found for clip {start_time}-{end_time}")
                processed_clips.append(
                    {
                        **clip,
                        "broll_suggestions": [],
                        "error": "No transcript content found",
                    }
                )
                continue

            # Generate B-roll queries
            broll_queries = generate_broll_queries(client, transcript_content, clip)

            broll_suggestions = []

            for j, query_data in enumerate(broll_queries, 1):
                # The entries are model output; skip anything that is not a
                # JSON object rather than failing on .get below.
                if not isinstance(query_data, dict):
                    print(f"Skipping malformed query entry {j}/{len(broll_queries)}")
                    continue

                print(f"Processing query {j}/{len(broll_queries)}: {query_data.get('query', 'Unknown')}")
                query = query_data.get("query", "")
                # Model output may carry the timestamp as a string or null;
                # normalise it so the addition below cannot raise.
                timestamp = _coerce_timestamp(query_data.get("timestamp_in_clip", 0))
                reason = query_data.get("relevance_reason", "")

                if not query:
                    continue

                # Search Google Images if API is available
                images = []
                if google_api_key and search_engine_id:
                    print(f"Searching Google Images for: {query}")
                    images = search_google_images(
                        query, google_api_key, search_engine_id
                    )
                    print(f"Found {len(images)} images")
                else:
                    print("Skipping Google Images search (no API credentials)")

                broll_suggestions.append(
                    {
                        "query": query,
                        "timestamp_in_clip": timestamp,
                        "absolute_timestamp": start_time + timestamp,
                        "relevance_reason": reason,
                        "images": images,
                    }
                )

            processed_clips.append(
                {
                    **clip,
                    "transcript_content": transcript_content,
                    "broll_suggestions": broll_suggestions,
                }
            )
            print(f"Completed processing clip {i}, found {len(broll_suggestions)} suggestions")

        print(f"B-roll generation complete. Processed {len(processed_clips)} clips")
        return processed_clips

    except Exception as e:
        print(f"Error in process_broll_generation: {str(e)}")
        # Bare raise keeps the original traceback intact.
        raise


def _coerce_timestamp(value):
    """Return value as a number of seconds, or 0 when it cannot be interpreted."""
    # Real numbers pass through untouched so int/float types are preserved.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return value
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0


def format_broll_output(processed_clips: List[Dict]) -> str:
    """
    Render processed clips and their B-roll suggestions as chat-ready text

    Args:
        processed_clips: Clip dictionaries; the keys read are clip_title,
            start_time, end_time and broll_suggestions (each suggestion
            supplying query, timestamp_in_clip and images)

    Returns:
        Multi-line display string, or a fixed notice when the list is empty
    """
    if not processed_clips:
        return "No B-roll suggestions generated."

    def _clock(total_seconds) -> str:
        # Whole seconds rendered as zero-padded MM:SS.
        minutes, seconds = divmod(int(total_seconds), 60)
        return f"{minutes:02d}:{seconds:02d}"

    lines = ["🎬 B-Roll Suggestions\n"]

    for clip_no, clip in enumerate(processed_clips, 1):
        heading = clip.get("clip_title", "Unknown Clip")
        window_start = _clock(clip.get("start_time", 0))
        window_end = _clock(clip.get("end_time", 0))

        lines.append(f"\n{clip_no}. {heading}")
        lines.append(f"Time: {window_start} - {window_end}")

        suggestions = clip.get("broll_suggestions", [])
        if not suggestions:
            lines.append("No B-roll suggestions available for this clip.")
            # Blank separator line after every clip
            lines.append("")
            continue

        for suggestion_no, suggestion in enumerate(suggestions, 1):
            search_text = suggestion.get("query", "")
            offset = suggestion.get("timestamp_in_clip", 0)
            found = suggestion.get("images", [])

            lines.append(f"  Query {suggestion_no}: {search_text}")
            lines.append(f"  At: {_clock(offset)}")

            if not found:
                lines.append("    No images found for this query.")
                continue

            # Only the first two images are considered; entries without a URL
            # are skipped but still consume their link number.
            for link_no, image in enumerate(found[:2], 1):
                link = image.get("image_url", "")
                caption = image.get("title", "Image")
                if link:
                    lines.append(f"    Link {link_no}: {caption[:50]} - {link}")

        # Blank separator line after every clip
        lines.append("")

    return "\n".join(lines)