Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*- | |
import gradio as gr | |
from typing import Optional, List | |
from pydantic import BaseModel, Field | |
from phi.agent import Agent | |
from phi.model.google import Gemini | |
from phi.workflow import Workflow, RunResponse, RunEvent | |
from phi.storage.workflow.sqlite import SqlWorkflowStorage | |
from phi.tools.duckduckgo import DuckDuckGo | |
from phi.utils.pprint import pprint_run_response | |
from phi.utils.log import logger | |
from agent_tools import ( | |
object_detection_embed, | |
summarize_video, | |
) | |
from utils import ( | |
create_poster, | |
download_youtube_video, | |
generate_tmp_filename, | |
pdf_to_jpg, | |
) | |
import os | |
from PIL import Image | |
import numpy as np | |
# Choices offered by the "Output Language" dropdown; the first entry is its default.
LANG_OPTIONS = ["Original", "Chinese", "English"]
#==================================================================================== | |
class Video(BaseModel):
    """Record describing a single video.

    All four fields are declared with ``Field(...)`` (no default value);
    ``summary`` and ``hash_value`` are typed ``Optional[str]`` and so also
    accept ``None``.
    """

    name: str = Field(
        ...,
        description="File name of the video.",
    )
    url: str = Field(
        ...,
        description="Link to the video.",
    )
    summary: Optional[str] = Field(
        ...,
        description="Summary of the video.",
    )
    hash_value: Optional[str] = Field(
        ...,
        description="sha256_hash value of the video.",
    )
class VideoCache:
    """Cache of video summaries and the frames extracted from each video.

    Entries are keyed by ``"<basename>_<out_lang>"`` and live in the value of
    a ``gr.State`` holding two dicts: ``'metadata'`` (key -> summary) and
    ``'frame_data'`` (key -> list of serialized frames).

    NOTE(review): the state is read and written through ``.value`` directly
    instead of being passed through event handlers, so entries look like they
    are shared by every user of this instance -- confirm that is intended.
    """

    def __init__(self):
        self.session_state = gr.State({
            'metadata': {},    # For summaries
            'frame_data': {},  # For image arrays (serialized)
        })

    def add_to_cache(self, basename: str, out_lang: str, summary: str, frames: list[np.ndarray]):
        """Store the summary and frames of one video/language combination.

        Args:
            basename: Name identifying the video.
            out_lang: Output language the summary was produced for.
            summary: Summary text to cache.
            frames: Arrays to cache; each is serialized to plain bytes.
        """
        key = basename + '_' + out_lang
        # Convert numpy arrays to (bytes, shape, dtype) tuples
        serialized_frames = [self._array_to_bytes(arr) for arr in frames]
        # Update cache
        current_state = self.session_state.value
        current_state['metadata'][key] = summary
        current_state['frame_data'][key] = serialized_frames
        self.session_state.value = current_state

    def get_from_cache(self, basename: str, out_lang: str) -> tuple:
        """Look up the cached summary and frames of a video/language pair.

        Args:
            basename: Name identifying the video.
            out_lang: Output language the summary was produced for.

        Returns:
            ``(summary, frames)``. On a cache miss ``summary`` is ``None`` and
            ``frames`` is an empty list.
        """
        key = basename + '_' + out_lang
        cache = self.session_state.value
        summary = cache['metadata'].get(key)
        frame_bytes = cache['frame_data'].get(key, [])
        # Convert the serialized tuples back to arrays
        frames = [self._bytes_to_array(*b) for b in frame_bytes]
        return summary, frames

    # The two helpers below never touch instance state. They must be static:
    # without the decorator, ``self._array_to_bytes(arr)`` passes ``self`` as
    # an extra positional argument and raises TypeError.
    @staticmethod
    def _array_to_bytes(arr: np.ndarray) -> tuple:
        """Convert an array to ``(bytes, shape, dtype string)``."""
        # The dtype is recorded so non-uint8 arrays survive the round trip.
        return arr.tobytes(), arr.shape, arr.dtype.str

    @staticmethod
    def _bytes_to_array(b: bytes, shape: tuple, dtype=np.uint8) -> np.ndarray:
        """Rebuild an array from the tuple made by ``_array_to_bytes``.

        ``dtype`` defaults to ``uint8`` so ``(bytes, shape)`` pairs without a
        recorded dtype can still be decoded.
        """
        # frombuffer yields a read-only view over ``b``; copy so callers get
        # an ordinary writable array like the one that was stored.
        return np.frombuffer(b, dtype=dtype).reshape(shape).copy()
class VideoPosterGenerator(Workflow):
    """Workflow that turns a video URL into a one-page summary poster.

    ``run`` calls the tool functions directly: it downloads the video,
    summarizes it, extracts object images and renders the poster. Summary and
    images are stored in the module-level ``video_cache`` so a later request
    for the same video and language can go straight to poster creation.
    """

    # Define an Agent that will load video clip
    loader: Agent = Agent(
        tools=[download_youtube_video],
        show_tool_calls=True,
        description="Given a url_link, load video to process.",
    )

    # Define an Agent that will summarize video
    summarizer: Agent = Agent(
        tools=[summarize_video],
        show_tool_calls=True,
        markdown=True,
        description="Given a video, answer the prompt questions.",
    )

    # Define an Agent that will extract top three object images
    detector: Agent = Agent(
        tools=[object_detection_embed],
        show_tool_calls=True,
        structured_outputs=True,
        description="Given a video, extract top three object images.",
    )

    # Define an Agent that will generate a poster
    poster: Agent = Agent(
        tools=[create_poster],
        show_tool_calls=True,
        structured_outputs=True,
        description="Given summary and images, generate one page postes.",
    )

    def run(self, url: str, user_prompt: str, out_lang: str, use_cache: bool = True) -> RunResponse:
        """Generate a poster for the video at ``url``.

        Args:
            url: Link to the video.
            user_prompt: Questions handed to ``summarize_video``.
            out_lang: Requested output language; also part of the cache key.
            use_cache: When true, reuse a cached summary and images if present.

        Returns:
            A ``RunResponse`` with event ``workflow_completed``. On success
            ``content`` is ``[video_path, poster]`` (``video_path`` is ``None``
            when the result came from the cache); on failure ``content`` is an
            error-message string.
        """

        def finished(content) -> RunResponse:
            # Every exit path reports the same event; only the payload differs.
            return RunResponse(event=RunEvent.workflow_completed, content=content)

        logger.info(f"Generating a poster for video: {url}")
        basename = os.path.basename(url)
        pdf_name = generate_tmp_filename(basename, ".pdf")

        # Step 1: Use the cached video poster if use_cache is True
        if use_cache:
            summary, objects = video_cache.get_from_cache(basename, out_lang)
            if summary is not None and objects is not None:
                logger.info(f"found cached_video_content: {url}")
                # NOTE(review): this path passes out_lang to create_poster while
                # the fresh path below passes the language returned by
                # summarize_video -- confirm create_poster treats both alike.
                poster_response = create_poster(pdf_name, objects, out_lang, summary, url)
                if poster_response is None:
                    return finished("Failed to generate video poster, please try again!")
                logger.info("Poster is generated sucessfully.")
                return finished([None, poster_response])

        # Step 2: load video for the given url
        video_path = download_youtube_video(url)
        # If the video is not loaded sucessfully, end the workflow
        if video_path is None:
            return finished(f"Sorry, could not load the video: {url}")
        logger.info(f"Video {url} is loaded.")

        # Step 3: summarize the video for the given questions
        summary_response = summarize_video(video_path, user_prompt, out_lang)
        # If the summary is not generated, end the workflow
        if summary_response is None:
            return finished("Failed to get summary, please try again!")
        logger.info("Video summary is generated.")
        lang, summary = summary_response

        # Step 4: extract top 3 object (person or other) images
        objects = object_detection_embed(video_path)
        # If objects are not detected sucessfully, end the workflow
        if objects is None:
            return finished("Failed to extract images, please try again!")
        logger.info("Objects are extracted sucessfully.")

        # Step 5: generate video poster
        poster_response = create_poster(pdf_name, objects, lang, summary, url)
        if poster_response is None:
            return finished("Failed to generate video poster, please try again!")
        logger.info("Poster is generated sucessfully.")

        # Store in cache
        video_cache.add_to_cache(basename=basename, out_lang=out_lang, summary=summary, frames=objects)
        return finished([video_path, poster_response])
#===================================================================================== | |
# Combine the outputs of object detection and video summary to generate a single-page poster
def generate_poster_2(url, user_prompt, out_lang):
    """Run the poster workflow for ``url`` and return the results for the UI.

    Args:
        url: Link to the video to process.
        user_prompt: Questions passed on to the workflow.
        out_lang: Output language passed on to the workflow.

    Returns:
        ``(video_path, video_poster, jpg_name)``: the video path and poster
        reported by the workflow, and the temporary ``.jpg`` file name that
        was passed to ``pdf_to_jpg``.

    Raises:
        gr.Error: If the workflow did not return a ``[video_path, poster]``
            pair. The workflow's own message is used when it supplies one.
    """
    url_base_name = os.path.basename(url)
    jpg_name = generate_tmp_filename(url_base_name, ".jpg")

    # Initialize the poster generator workflow
    # - Creates a session ID based on the video url
    # - Sets up SQLite storage for the workflow
    workflow = VideoPosterGenerator(
        session_id=f"generate-poster-on-{url}",
        storage=SqlWorkflowStorage(
            table_name="generate_poster_workflows",
            db_file="tmp/workflows.db",
        ),
    )

    # Execute the workflow with caching enabled; the result's ``content`` is
    # either a [video_path, poster] pair or an error-message string.
    content = workflow.run(url=url, user_prompt=user_prompt, out_lang=out_lang, use_cache=True).content
    if not isinstance(content, (list, tuple)):
        # Unpacking a failure message would raise an opaque ValueError;
        # surface the message to the user instead.
        message = content if isinstance(content, str) else "Failed to generate video poster, please try again!"
        raise gr.Error(message)

    video_path, video_poster = content
    logger.info(f"video_poster: {video_poster}")
    pdf_to_jpg(video_poster, jpg_name)
    return video_path, video_poster, jpg_name
#================================================================================== | |
# Gradio interface | |
# Static text shown by the interface.
_HEADER_MD = """
# 🎥 Video Smart Summary - From Video to Poster with Multimodal Agent
Provide a YouTube or other video url to get an AI-generated summary poster.
"""

_DEFAULT_PROMPT = '''0. **Title**: Summarize this video in one sentence with no more than 8 words.
1. **Story:** How the set scene introduced and tone is set. What is happening in the scene? Describe key visuals and actions.
2. **Characters**: Identify top three character, noting their expressions, attire, actions, and interactions. Highlight emotional nuances and gestures.
3. **Narration or Voiceover**: Describe what types of narrations or voiceovers are used in the video.
4. **Mood and Tone**: Capture the overall mood and tone of each scene, mentioning any music or sound effects that enhance these elements.'''

_USAGE_MD = """
### How to use:
1. Paste a YouTube link in the URL input textbox;
2. Select output language you want to use, currently only support original(default, no translation), English and Chinese;
3. Modify you prompt questions if you want (optional);
4. Click the primary task button "Generate Poster";
5. Downalod generated poster (JPG or PDF) file from ‘Generated Poster ...’ block.
*Note: Processing may take a few minutes depending on the video length.*
*If you get error for some reason, retry it before debug it!*
"""

print("Setting up Gradio interface...")
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Module-level cache read by VideoPosterGenerator.run; its constructor
    # creates a gr.State.
    video_cache = VideoCache()

    gr.Markdown(_HEADER_MD)

    # Inputs: video URL and preview on the left, language and prompt on the right.
    with gr.Row():
        with gr.Column(scale=5, variant="compact"):
            url_input = gr.Textbox(
                label="Paste YouTube URL here",
                placeholder="https://www.youtube.com/shorts/AE5HZsZOlkY",
                value="https://www.youtube.com/shorts/AE5HZsZOlkY",
            )
            video_input = gr.Video(label="Downloaded Video", height=300, scale=5)
        with gr.Column(scale=5, variant="compact"):
            lang_name = gr.Dropdown(
                choices=LANG_OPTIONS,
                value=LANG_OPTIONS[0],
                label="Output Language",
                interactive=True,
            )
            user_prompt = gr.Textbox(
                label="📊 User Prompt",
                value=_DEFAULT_PROMPT,
                placeholder="Ask anything about the video - AI Agent will analyze everything and search the web if needed",
                info="You can ask questions about the video content",
                max_lines=30,
                interactive=True,
            )

    with gr.Row():
        poster_button = gr.Button("🚀 Generate Poster", variant="primary")

    # Outputs: poster image on the left, poster PDF on the right.
    with gr.Row():
        with gr.Column(scale=6, variant="compact"):
            jpg_file = gr.Image(label="Generated Poster Image", type="filepath")
        with gr.Column(scale=4, variant="compact"):
            pdf_file = gr.File(label="Generated Poster PDF", file_types=[".pdf"])

    gr.Markdown(_USAGE_MD)

    # actions
    poster_button.click(
        generate_poster_2,
        inputs=[url_input, user_prompt, lang_name],
        outputs=[video_input, pdf_file, jpg_file],
    )

demo.launch(share=True)