videoanalysis / app.py
codelion's picture
Update app.py
7c2c622 verified
raw
history blame
6.95 kB
import os
import json
import gradio as gr
import cv2
import matplotlib.pyplot as plt # imported for compatibility if needed later
from collections import Counter
from google import genai
from google.genai import types
from google.genai.types import Part
from tenacity import retry, stop_after_attempt, wait_random_exponential
# NOTE(review): `Counter` and `types` appear unused in this file — confirm before removing.

# Retrieve API key from environment variables.
# Fail fast at import time: the app is unusable without credentials.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("Please set the GOOGLE_API_KEY environment variable.")

# Initialize the Gemini API client via AI Studio using the API key.
client = genai.Client(api_key=GOOGLE_API_KEY)

# Use the Gemini 2.0 Flash model.
MODEL_NAME = "gemini-2.0-flash"
@retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
def call_gemini(video_url: str, prompt: str, mime_type: str = "video/webm") -> str:
    """
    Call the Gemini model with the provided video URL and prompt.

    The video is passed as a URI part. The MIME type defaults to
    "video/webm" (preserving the original behavior) but can now be
    overridden for other formats, e.g. "video/mp4".

    Args:
        video_url: Publicly accessible URI of the video to analyze.
        prompt: Text instruction sent alongside the video.
        mime_type: MIME type describing the video at ``video_url``.

    Returns:
        The model's text response.

    Raises:
        Re-raises the underlying API error after 3 attempts with
        randomized exponential backoff (capped at 60 s between tries).
    """
    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=[
            Part.from_uri(file_uri=video_url, mime_type=mime_type),
            prompt,
        ],
    )
    return response.text
def hhmmss_to_seconds(time_str: str) -> float:
    """
    Convert a colon-separated timestamp into total seconds.

    Accepts "HH:MM:SS", "MM:SS", or a bare "SS" value; components may be
    fractional (e.g. "01:02.5").

    Args:
        time_str: The timestamp string (surrounding whitespace is ignored).

    Returns:
        Total seconds as a float.

    Raises:
        ValueError: If the string has more than three components or any
            component is not numeric. (The previous version silently
            returned only the first component for >3 parts; callers
            already catch exceptions and skip the bad timestamp.)
    """
    parts = [float(p) for p in time_str.strip().split(":")]
    if len(parts) > 3:
        raise ValueError(f"Invalid HH:MM:SS timestamp: {time_str!r}")
    # Weight components right-to-left: seconds, minutes (60), hours (3600).
    return sum(value * 60 ** power for power, value in enumerate(reversed(parts)))
def _parse_key_frames_json(raw: str) -> list:
    """
    Parse the model's key-frame response into a list.

    Gemini frequently wraps JSON output in a Markdown code fence
    (```json ... ```), which would make a bare ``json.loads`` fail;
    strip any fence before parsing. Returns [] when the text is not a
    valid JSON array.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (may carry a language tag) ...
        text = text.split("\n", 1)[1] if "\n" in text else ""
        # ... and the closing fence, if present.
        if text.rstrip().endswith("```"):
            text = text.rstrip()[:-3]
    try:
        frames = json.loads(text)
    except json.JSONDecodeError:
        return []
    return frames if isinstance(frames, list) else []


def get_key_frames(video_url: str, analysis: str, user_query: str) -> list:
    """
    Prompt Gemini to return key frame timestamps (in HH:MM:SS) with descriptions,
    then extract those frames from the video using OpenCV.

    Args:
        video_url: Publicly accessible URI of the video (also passed to OpenCV).
        analysis: The textual analysis to base key-frame selection on.
        user_query: Optional extra focus for frame selection (may be empty).

    Returns:
        A list of tuples: (image_array in RGB, caption). Empty on any failure.
    """
    prompt = (
        "Based on the following video analysis, identify key frames that best illustrate "
        "the important events or anomalies. Return a JSON array where each element is an object "
        "with two keys: 'timestamp' (in HH:MM:SS format) and 'description' (a brief explanation of why "
        "this frame is important)."
    )
    prompt += f" Video Analysis: {analysis}"
    if user_query:
        prompt += f" Additional focus: {user_query}"

    try:
        key_frames = _parse_key_frames_json(call_gemini(video_url, prompt))
    except Exception:
        # Best-effort: a failed model call simply yields no key frames.
        key_frames = []

    extracted_frames = []
    cap = cv2.VideoCapture(video_url)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return extracted_frames
    try:
        for frame_obj in key_frames:
            # The model may return malformed elements; skip anything unusable.
            if not isinstance(frame_obj, dict):
                continue
            ts = frame_obj.get("timestamp")
            description = frame_obj.get("description", "")
            try:
                seconds = hhmmss_to_seconds(ts)
            except Exception:
                continue
            # Seek by wall-clock position (OpenCV expects milliseconds).
            cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
            ret, frame = cap.read()
            if ret:
                # OpenCV decodes BGR; Gradio galleries expect RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                extracted_frames.append((frame_rgb, f"{ts}: {description}"))
    finally:
        # Always free the capture handle, even if extraction fails mid-loop.
        cap.release()
    return extracted_frames
def analyze_video(video_url: str, user_query: str) -> tuple[str, list]:
    """
    Perform iterative, agentic video analysis.

    First refines the video analysis over several model iterations, then
    prompts the model to identify key frames.

    Args:
        video_url: Publicly accessible URI of the video.
        user_query: Optional focus to steer the analysis (may be empty).

    Returns:
        A tuple of:
        - A Markdown report as a string.
        - A gallery list of key frames (each a tuple of (image, caption)).
    """
    # The first-iteration prompt is loop-invariant; build it once.
    base_prompt = (
        "You are a video analysis agent focusing on security and surveillance. "
        "Provide a detailed summary of the video, highlighting key events, suspicious activities, or anomalies."
    )
    if user_query:
        base_prompt += f" Also, focus on the following query: {user_query}"

    analysis = ""
    num_iterations = 3
    for i in range(num_iterations):
        if i == 0:
            prompt = base_prompt
        else:
            # Later iterations refine the previous pass rather than starting over.
            prompt = (
                f"Based on the previous analysis: \"{analysis}\". "
                "Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
                "and details that would help a security team understand the situation better."
            )
            if user_query:
                prompt += f" Remember to focus on: {user_query}"
        try:
            analysis = call_gemini(video_url, prompt)
        except Exception as e:
            # Keep whatever analysis we have; record the failure in the report.
            analysis += f"\n[Error during iteration {i+1}: {e}]"
            break

    # Create a Markdown report
    markdown_report = f"## Video Analysis Report\n\n**Summary:**\n\n{analysis}\n"
    # Get key frames based on the analysis and optional query.
    key_frames_gallery = get_key_frames(video_url, analysis, user_query)
    if not key_frames_gallery:
        markdown_report += "\n*No key frames were extracted.*\n"
    else:
        markdown_report += "\n**Key Frames Extracted:**\n"
        for idx, (_img, caption) in enumerate(key_frames_gallery, start=1):
            markdown_report += f"- **Frame {idx}:** {caption}\n"
    return markdown_report, key_frames_gallery
def gradio_interface(video_url: str, user_query: str) -> tuple[str, list]:
    """
    Gradio interface function that accepts a video URL and an optional query,
    then returns a Markdown report and a gallery of key frame images with captions.

    Args:
        video_url: The URL entered by the user.
        user_query: Optional analysis focus (may be empty).

    Returns:
        (Markdown report string, gallery list) matching the two outputs.
    """
    # Reject missing AND whitespace-only input; the old check let "   "
    # through to a doomed API call.
    if not video_url or not video_url.strip():
        return "Please provide a valid video URL.", []
    return analyze_video(video_url, user_query)
# Define the Gradio interface with two inputs and two outputs.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Video URL (publicly accessible, e.g., YouTube direct link or video file URL)"),
        gr.Textbox(label="Analysis Query (optional): guide the focus of the analysis", placeholder="e.g., focus on unusual movements near the entrance")
    ],
    outputs=[
        gr.Markdown(label="Security & Surveillance Analysis Report"),
        # `.style(grid=..., height=...)` was deprecated in Gradio 3.x and
        # removed in Gradio 4; pass the layout as constructor kwargs instead.
        gr.Gallery(label="Extracted Key Frames", columns=2, height="auto")
    ],
    title="AI Video Analysis and Summariser Agent",
    description=(
        "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
        "to iteratively analyze a video for security and surveillance insights. Provide a video URL and, optionally, "
        "a query to guide the analysis. The tool returns a detailed Markdown report along with a gallery of key frame images."
    )
)

if __name__ == "__main__":
    iface.launch()