import streamlit as st
import os
import cv2
import torch
from PIL import Image
from gtts import gTTS
from scenedetect import open_video, SceneManager, ContentDetector
from transformers import BlipProcessor, BlipForConditionalGeneration
from openai import OpenAI
import base64
# Load the BLIP image-captioning model and processor
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# Read the OpenAI API key from the environment rather than hardcoding it in the source
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
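# A possible refinement (not applied here): wrap the model loading above in a function
# decorated with @st.cache_resource so Streamlit loads the BLIP weights once per session
# instead of on every rerun, e.g.:
#   @st.cache_resource
#   def load_blip():
#       processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
#       model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
#       return processor, model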
# Streamlit App UI
st.title("AI-Powered Video Summarization")
# Define custom CSS
def set_background(image_file):
    with open(image_file, "rb") as image:
        encoded_string = base64.b64encode(image.read()).decode()
    st.markdown(
        f"""
        <style>
        .stApp {{
            background-image: url("data:image/jpg;base64,{encoded_string}");
            background-size: cover;
            background-position: center;
            background-repeat: no-repeat;
        }}
        </style>
        """,
        unsafe_allow_html=True
    )
# Set background
set_background("yellow-wallpaper.jpg")
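# Assumes yellow-wallpaper.jpg is present in the working directory the app is launched from.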
uploaded_file = st.file_uploader("Upload a Video File", type=["mp4"])
if uploaded_file:
    video_path = "input_video.mp4"
    with open(video_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.video(video_path)
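    # The upload is written to a file because PySceneDetect and OpenCV read the video from a path.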
    # Scene Detection & Frame Extraction
    st.write("Detecting scene changes and extracting key frames...")
    def extract_key_frames(video_path, output_folder="frames"):
        os.makedirs(output_folder, exist_ok=True)
        # Clear frames left over from a previous upload so they don't leak into this summary
        for old_file in os.listdir(output_folder):
            if old_file.endswith(".jpg"):
                os.remove(os.path.join(output_folder, old_file))
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=27.0))
        scene_manager.detect_scenes(video)
        scenes = scene_manager.get_scene_list()
        cap = cv2.VideoCapture(video_path)
        for i, (start, end) in enumerate(scenes):
            frame_number = start.get_frames()  # Frame index at the start of the scene
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            ret, frame = cap.read()
            if ret:
                frame_path = os.path.join(output_folder, f"scene_{i+1}.jpg")
                cv2.imwrite(frame_path, frame)
                print(f"Saved: {frame_path}")
        cap.release()
    extract_key_frames(video_path)
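    # At this point frames/ holds one JPEG per detected scene (scene_1.jpg, scene_2.jpg, ...).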
    # Caption Generation
    st.write("Generating captions for extracted frames...")
    def generate_caption(image_path):
        image = Image.open(image_path).convert("RGB")
        inputs = caption_processor(image, return_tensors="pt")
        caption_ids = caption_model.generate(**inputs)
        return caption_processor.decode(caption_ids[0], skip_special_tokens=True)
    captions = []
    # Sort frames by scene number so captions stay in chronological order
    # (a plain lexicographic sort would put scene_10.jpg before scene_2.jpg)
    frame_files = sorted(
        (f for f in os.listdir("frames") if f.startswith("scene_") and f.endswith(".jpg")),
        key=lambda f: int(f.removeprefix("scene_").removesuffix(".jpg")),
    )
    for filename in frame_files:
        image_path = os.path.join("frames", filename)
        captions.append(generate_caption(image_path))
    # st.write("Generated Captions:", captions)
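    # Optional (not applied here): with a GPU available, captioning can be sped up by moving
    # the BLIP model and its inputs onto it, e.g.:
    #   device = "cuda" if torch.cuda.is_available() else "cpu"
    #   caption_model.to(device)
    #   inputs = caption_processor(image, return_tensors="pt").to(device)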
    # Summarization
    st.write("Summarizing captions using AI...")
    def summarize_captions(captions):
        # Join the per-frame captions into plain text instead of interpolating the raw Python list
        caption_text = "\n".join(captions)
        prompt = f"Summarize the following sequence of video frames into a meaningful story under 800 characters:\n\n{caption_text}"
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an AI that summarizes video content."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=200,
        )
        return completion.choices[0].message.content
    summary = summarize_captions(captions)
    st.write("Video Summary:", summary)
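    # The summary length is bounded twice: by the "under 800 characters" instruction in the prompt
    # and by max_tokens=200 on the completion request.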
    # Text-to-Speech
    st.write("Generating voice narration...")
    def text_to_speech(text, output_audio="summary_audio.mp3"):
        tts = gTTS(text, lang="en")
        tts.save(output_audio)
    text_to_speech(summary)
    st.audio("summary_audio.mp3")
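    # Note: gTTS synthesizes speech through Google's online TTS service, so this step needs network access.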
    # # Combine Audio & Video (disabled; requires MoviePy, e.g. `import moviepy.editor as mp` on MoviePy 1.x)
    # st.write("Merging audio with the video...")
    # def add_audio_to_video(video_path, audio_path, output_video="final_video.mp4"):
    #     video = mp.VideoFileClip(video_path)
    #     audio = mp.AudioFileClip(audio_path)
    #     # Trim the narration if it runs longer than the video
    #     if audio.duration > video.duration:
    #         audio = audio.subclip(0, video.duration)
    #     final_video = video.set_audio(audio)
    #     final_video.write_videofile(output_video, codec="libx264", audio_codec="aac")
    # add_audio_to_video(video_path, "summary_audio.mp3")
    # st.video("final_video.mp4")