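"""Streamlit app: AI-powered video summarization.

Pipeline: detect scene changes with PySceneDetect, save one key frame per scene with
OpenCV, caption each frame with BLIP, summarize the captions with gpt-4o-mini via the
OpenAI API, and narrate the summary with gTTS.
"""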
import streamlit as st
import os
import cv2
import torch
from PIL import Image
from gtts import gTTS
from scenedetect import open_video, SceneManager, ContentDetector
from transformers import BlipProcessor, BlipForConditionalGeneration
from openai import OpenAI
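# import moviepy.editor as mp  # only needed if the commented-out audio/video merge step at the bottom is re-enabled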
# Load AI models
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
caption_model = torch.quantization.quantize_dynamic(caption_model, {torch.nn.Linear}, dtype=torch.qint8)
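# Dynamic int8 quantization of the Linear layers above shrinks BLIP and speeds up CPU-only inference.
# NOTE: Streamlit reruns this whole script on every interaction, so the model is reloaded each time.
# A common pattern (sketch only, not wired in here) is to cache the load once per process:
#
#     @st.cache_resource
#     def load_caption_model():
#         processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
#         model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
#         model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
#         return processor, model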
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")  # read the key from the environment; never hardcode secrets in the source
)
# Streamlit App UI
st.title("AI-Powered Video Summarization")
st.markdown(
    """
    <style>
    .stApp {
        background-image: url("https://s29938.pcdn.co/wp-content/uploads/2019/02/Wallpaper-Kemra-WhiteSubwayTile-4-1100x1320.jpg.optimal.jpg");
        background-size: cover;
        background-position: center;
        background-repeat: no-repeat;
    }
    </style>
    """,
    unsafe_allow_html=True
)

uploaded_file = st.file_uploader("Upload a Video File", type=["mp4"])
if uploaded_file:
    video_path = "input_video.mp4"
    with open(video_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.video(video_path)
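    # The upload is written to disk first because PySceneDetect and cv2.VideoCapture expect a file path.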
    # Scene Detection & Frame Extraction
    st.write("Detecting scene changes and extracting key frames...")
    def extract_key_frames(video_path, output_folder="frames"):
        os.makedirs(output_folder, exist_ok=True)
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=27.0))
        scene_manager.detect_scenes(video)
        scenes = scene_manager.get_scene_list()

        cap = cv2.VideoCapture(video_path)
        for i, (start, end) in enumerate(scenes):
            frame_number = start.get_frames()  # index of the first frame of the scene
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            ret, frame = cap.read()
            if ret:
                frame_path = os.path.join(output_folder, f"scene_{i+1}.jpg")
                cv2.imwrite(frame_path, frame)
                print(f"Saved: {frame_path}")
        cap.release()

    extract_key_frames(video_path)
    # Caption Generation
    st.write("Generating captions for extracted frames...")
    def generate_caption(image_path):
        image = Image.open(image_path).convert("RGB")
        inputs = caption_processor(image, return_tensors="pt")
        caption_ids = caption_model.generate(**inputs)
        return caption_processor.decode(caption_ids[0], skip_special_tokens=True)
    captions = []
    # Sort by scene index so frames are captioned in order (a plain lexicographic sort puts scene_10 before scene_2).
    frame_files = [f for f in os.listdir("frames") if f.endswith(".jpg")]
    frame_files.sort(key=lambda name: int(name.split("_")[1].split(".")[0]))
    for filename in frame_files:
        image_path = os.path.join("frames", filename)
        captions.append(generate_caption(image_path))
    # st.write("Generated Captions:", captions)
    # Summarization
    st.write("Summarizing captions using AI...")

    def summarize_captions(captions):
        prompt = f"Summarize the following sequence of video frames into a meaningful story under 500 characters:\n\n{captions}"
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": "You are an AI that summarizes video content."},
                      {"role": "user", "content": prompt}],
            max_tokens=150
        )
        return completion.choices[0].message.content

    summary = summarize_captions(captions)
    st.write("Video Summary:", summary)
    # Text-to-Speech
    st.write("Generating voice narration...")

    def text_to_speech(text, output_audio="summary_audio.mp3"):
        tts = gTTS(text, lang="en")
        tts.save(output_audio)

    text_to_speech(summary)
    st.audio("summary_audio.mp3")
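    # gTTS synthesizes speech through Google's online TTS endpoint, so the Space needs outbound network access.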
    # # Combine Audio & Video (disabled; requires moviepy, see the commented import at the top)
    # st.write("Merging audio with the video...")
    # def add_audio_to_video(video_path, audio_path, output_video="final_video.mp4"):
    #     video = mp.VideoFileClip(video_path)
    #     audio = mp.AudioFileClip(audio_path)
    #     if audio.duration > video.duration:
    #         audio = audio.subclip(0, video.duration)
    #     final_video = video.set_audio(audio)
    #     final_video.write_videofile(output_video, codec="libx264", audio_codec="aac")
    # add_audio_to_video(video_path, "summary_audio.mp3")
    # st.video("final_video.mp4")