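# AI-Powered Video Summarization (Streamlit app).
# Pipeline: PySceneDetect finds scene cuts -> OpenCV extracts key frames ->
# BLIP captions each frame -> an OpenAI chat model condenses the captions
# into a summary -> gTTS narrates it.
#
# Assumed dependencies (usual PyPI names, not pinned here):
#   pip install streamlit opencv-python torch pillow gtts scenedetect transformers openai
# Run with (assuming the file is saved as app.py):
#   streamlit run app.py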
import os
import re

import cv2
import streamlit as st
import torch
from PIL import Image
from gtts import gTTS
from openai import OpenAI
from scenedetect import open_video, SceneManager, ContentDetector
from transformers import BlipProcessor, BlipForConditionalGeneration


# Load the BLIP captioning model once and cache it so Streamlit reruns
# don't reload the weights on every interaction.
@st.cache_resource
def load_caption_model():
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return processor, model

caption_processor, caption_model = load_caption_model()

# Never hard-code API keys in source. The OpenAI client reads the
# OPENAI_API_KEY environment variable automatically (e.g. set it with
# `export OPENAI_API_KEY=...` before launching the app).
client = OpenAI()


# Streamlit App UI
st.title("πŸŽ₯ AI-Powered Video Summarization")

uploaded_file = st.file_uploader("πŸ“€ Upload a Video File", type=["mp4"])

if uploaded_file:
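    # Persist the upload to disk: PySceneDetect and OpenCV both read from a
    # file path rather than an in-memory buffer.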
    video_path = "input_video.mp4"
    with open(video_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    st.video(video_path)

    # Scene Detection & Frame Extraction
    st.write("πŸ” Detecting scene changes and extracting key frames...")

    def extract_key_frames(video_path, output_folder="frames", frames_per_scene=3):
        os.makedirs(output_folder, exist_ok=True)
        # Remove frames left over from a previous upload so stale scenes
        # are not captioned again.
        for old in os.listdir(output_folder):
            if old.endswith(".jpg"):
                os.remove(os.path.join(output_folder, old))

        # Detect scene cuts. ContentDetector's threshold (27.0) controls
        # sensitivity: lower values split scenes more aggressively.
        # SceneManager downscales frames automatically, so no manual
        # downscale step is needed.
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=27.0))
        scene_manager.detect_scenes(video)
        scenes = scene_manager.get_scene_list()

        # Sample evenly spaced frames from the interior of each scene.
        cap = cv2.VideoCapture(video_path)
        for i, (start, end) in enumerate(scenes):
            start_frame = start.get_frames()
            end_frame = end.get_frames()
            # max(1, ...) guards against very short scenes, where the
            # integer step would otherwise be zero.
            step = max(1, (end_frame - start_frame) // (frames_per_scene + 1))
            for j in range(frames_per_scene):
                frame_index = start_frame + step * (j + 1)
                if frame_index >= end_frame:
                    break
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
                ret, frame = cap.read()
                if ret:
                    frame_path = os.path.join(output_folder, f"scene_{i+1}_frame{j+1}.jpg")
                    cv2.imwrite(frame_path, frame)
        cap.release()

    extract_key_frames(video_path)

    # Caption Generation
    st.write("πŸ“ Generating captions for extracted frames...")

    def generate_caption(image_path):
        image = Image.open(image_path).convert("RGB")
        inputs = caption_processor(image, return_tensors="pt")
        # Inference only: disable gradient tracking to save memory.
        with torch.no_grad():
            caption_ids = caption_model.generate(**inputs, max_new_tokens=30)
        return caption_processor.decode(caption_ids[0], skip_special_tokens=True)

    def natural_sort_key(name):
        # Plain sorted() puts "scene_10" before "scene_2"; compare the
        # numeric runs in the filename as integers instead.
        return [int(t) if t.isdigit() else t for t in re.split(r"(\d+)", name)]

    captions = []
    for filename in sorted(os.listdir("frames"), key=natural_sort_key):
        if filename.endswith(".jpg"):
            image_path = os.path.join("frames", filename)
            captions.append(generate_caption(image_path))

    st.write("πŸ“„ Generated Captions:", captions)

    # Summarization
    st.write("πŸ“– Summarizing captions using AI...")

    def summarize_captions(captions):
        # Join the captions into a bulleted list rather than interpolating
        # the raw Python list repr into the prompt.
        caption_text = "\n".join(f"- {c}" for c in captions)
        prompt = (
            "Summarize the following sequence of video frame captions "
            f"into a meaningful story:\n\n{caption_text}"
        )
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an AI that summarizes video content."},
                {"role": "user", "content": prompt},
            ],
        )
        return completion.choices[0].message.content

    summary = summarize_captions(captions)
    st.write("πŸ“Œ Video Summary:", summary)

    # Text-to-Speech
    st.write("πŸ”Š Generating voice narration...")

    def text_to_speech(text, output_audio="summary_audio.mp3"):
        tts = gTTS(text, lang="en")
        tts.save(output_audio)
    
    text_to_speech(summary)
    st.audio('summary_audio.mp3')
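    # Note: gTTS synthesizes speech through Google's online TTS service, so
    # this step needs network access; an offline engine (e.g. pyttsx3) could
    # be swapped in if needed.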

    # # Combine Audio & Video (disabled: re-enable after `pip install moviepy`
    # # and adding `import moviepy.editor as mp` to the imports above)
    # st.write("🎬 Merging audio with the video...")

    # def add_audio_to_video(video_path, audio_path, output_video="final_video.mp4"):
    #     video = mp.VideoFileClip(video_path)
    #     audio = mp.AudioFileClip(audio_path)
    #     if audio.duration > video.duration:
    #         audio = audio.subclip(0, video.duration)
    #     final_video = video.set_audio(audio)
    #     final_video.write_videofile(output_video, codec="libx264", audio_codec="aac")

    # add_audio_to_video(video_path, "summary_audio.mp3")

    # st.video("final_video.mp4")