File size: 4,338 Bytes
3f5355b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import streamlit as st
import os
import cv2
import torch
import moviepy.editor as mp
from PIL import Image
from gtts import gTTS
from scenedetect import open_video, SceneManager, ContentDetector
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer
from openai import OpenAI
# Load AI models
# BLIP image-captioning model (processor handles image -> tensor, model generates caption ids).
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# SECURITY FIX: never commit API keys to source control. The key is read from
# the OPENAI_API_KEY environment variable (the previously hard-coded key must
# be revoked and rotated — anyone with repo access has seen it).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


# Streamlit App UI
st.title("πŸŽ₯ AI-Powered Video Summarization")

# Only MP4 uploads are accepted; `uploaded_file` is None until the user uploads.
uploaded_file = st.file_uploader("πŸ“€ Upload a Video File", type=["mp4"])

if uploaded_file:
    # Persist the upload to disk — cv2, scenedetect and moviepy all open files
    # by path, not from in-memory buffers.
    video_path = "input_video.mp4"
    with open(video_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Show the original video back to the user before processing starts.
    st.video(video_path)

    # Scene Detection & Frame Extraction
    st.write("πŸ” Detecting scene changes and extracting key frames...")

    def extract_key_frames(video_path, output_folder="frames", frames_per_scene=3):
        """Detect scene cuts and save evenly spaced key frames per scene.

        Frames are written to *output_folder* as scene_<i>_frame<j>.jpg.

        Fixes over the original:
        - `step` is clamped to >= 1: a scene shorter than frames_per_scene + 1
          frames previously yielded step == 0, writing the same start frame
          `frames_per_scene` times under different names.
        - sampling stops at the scene boundary instead of relying on step math.
        - `cap.release()` now runs even if an imwrite/read raises.
        """
        os.makedirs(output_folder, exist_ok=True)
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=27.0))
        video.set_downscale_factor()  # auto-downscale: faster detection, same cut points
        scene_manager.detect_scenes(video)
        scenes = scene_manager.get_scene_list()
        cap = cv2.VideoCapture(video_path)
        try:
            for i, (start, end) in enumerate(scenes):
                start_frame = start.get_frames()
                end_frame = end.get_frames()
                # Clamp so short scenes advance at least one frame per sample.
                step = max(1, (end_frame - start_frame) // (frames_per_scene + 1))
                for j in range(frames_per_scene):
                    frame_idx = start_frame + step * (j + 1)
                    if frame_idx >= end_frame:
                        break  # don't sample past the scene's last frame
                    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                    ret, frame = cap.read()
                    if ret:
                        frame_path = os.path.join(output_folder, f"scene_{i+1}_frame{j+1}.jpg")
                        cv2.imwrite(frame_path, frame)
        finally:
            cap.release()

    extract_key_frames(video_path)

    # Caption Generation
    st.write("πŸ“ Generating captions for extracted frames...")

    def generate_caption(image_path):
        """Return a BLIP-generated caption string for the image at *image_path*."""
        # Close the underlying file handle promptly; convert() returns a new image.
        with Image.open(image_path) as img:
            image = img.convert("RGB")
        inputs = caption_processor(image, return_tensors="pt")
        # Inference only — no_grad skips autograd bookkeeping (less memory, faster).
        with torch.no_grad():
            caption_ids = caption_model.generate(**inputs)
        return caption_processor.decode(caption_ids[0], skip_special_tokens=True)
    # Caption every extracted frame, in deterministic (sorted) filename order.
    captions = [
        generate_caption(os.path.join("frames", name))
        for name in sorted(os.listdir("frames"))
        if name.endswith(".jpg")
    ]

    st.write("πŸ“„ Generated Captions:", captions)

    # Summarization
    st.write("πŸ“– Summarizing captions using AI...")

    def summarize_captions(captions):
        """Ask the chat model to weave the frame captions into one narrative summary."""
        prompt = f"Summarize the following sequence of video frames into a meaningful story:\n\n{captions}"
        messages = [
            {"role": "system", "content": "You are an AI that summarizes video content."},
            {"role": "user", "content": prompt},
        ]
        completion = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
        return completion.choices[0].message.content

    summary = summarize_captions(captions)
    st.write("πŸ“Œ Video Summary:", summary)

    # Text-to-Speech
    st.write("πŸ”Š Generating voice narration...")

    def text_to_speech(text, output_audio="summary_audio.mp3"):
        """Render *text* as spoken English and save it as an MP3 at *output_audio*."""
        gTTS(text, lang="en").save(output_audio)

    text_to_speech(summary)

    # Combine Audio & Video
    st.write("🎬 Merging audio with the video...")

    def add_audio_to_video(video_path, audio_path, output_video="final_video.mp4"):
        """Replace the video's audio track with *audio_path* and write H.264/AAC output.

        The narration is trimmed to the video's duration if it runs longer.
        FIX: the clips are now closed in a finally block — moviepy clips hold
        open ffmpeg reader processes/file handles that the original leaked.
        """
        video = mp.VideoFileClip(video_path)
        audio = mp.AudioFileClip(audio_path)
        try:
            # A narration longer than the clip would extend past the last frame.
            if audio.duration > video.duration:
                audio = audio.subclip(0, video.duration)
            final_video = video.set_audio(audio)
            final_video.write_videofile(output_video, codec="libx264", audio_codec="aac")
        finally:
            audio.close()
            video.close()

    add_audio_to_video(video_path, "summary_audio.mp3")

    st.video("final_video.mp4")