import streamlit as st
import os
import cv2
import torch
from PIL import Image
from gtts import gTTS
from scenedetect import open_video, SceneManager, ContentDetector
from transformers import BlipProcessor, BlipForConditionalGeneration
from openai import OpenAI
import base64

# Load the BLIP captioning model once and cache it across Streamlit reruns.
@st.cache_resource
def load_caption_model():
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return processor, model

caption_processor, caption_model = load_caption_model()

# Never hard-code API keys in source. Read the key from the environment instead;
# OpenAI() also picks up OPENAI_API_KEY automatically if api_key is omitted.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
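
# The environment this script assumes (pip package names inferred from the imports above):
#   pip install streamlit opencv-python torch pillow gtts scenedetect transformers openai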


# Streamlit App UI
st.title("πŸŽ₯ AI-Powered Video Summarization")
# Define custom CSS


def set_background(image_file):
    with open(image_file, "rb") as image:
        encoded_string = base64.b64encode(image.read()).decode()

    st.markdown(
        f"""
        <style>
        .stApp {{
            background-image: url("data:image/jpeg;base64,{encoded_string}");
            background-size: cover;
            background-position: center;
            background-repeat: no-repeat;
        }}
        </style>
        """,
        unsafe_allow_html=True
    )

# Set background (skipped if the wallpaper image is not present)
if os.path.exists("yellow-wallpaper.jpg"):
    set_background("yellow-wallpaper.jpg")

uploaded_file = st.file_uploader("πŸ“€ Upload a Video File", type=["mp4"])

if uploaded_file:
    video_path = "input_video.mp4"
    with open(video_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    st.video(video_path)

    # Scene Detection & Frame Extraction
    st.write("πŸ” Detecting scene changes and extracting key frames...")

    def extract_key_frames(video_path, output_folder="frames"):
        os.makedirs(output_folder, exist_ok=True)
        video = open_video(video_path)
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=27.0))
        scene_manager.detect_scenes(video)
        scenes = scene_manager.get_scene_list()
        cap = cv2.VideoCapture(video_path)
        for i, (start, end) in enumerate(scenes):
            frame_num = start.get_frames()  # frame index at the start of the scene
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            
            if ret:
                frame_path = os.path.join(output_folder, f"scene_{i+1}.jpg")
                cv2.imwrite(frame_path, frame)
                print(f"Saved: {frame_path}")

        cap.release()

    extract_key_frames(video_path)
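
    # Optional preview of the extracted key frames (a minimal sketch; assumes
    # extract_key_frames just populated the "frames" folder as above).
    frame_files = sorted(f for f in os.listdir("frames") if f.endswith(".jpg"))
    if frame_files:
        st.image([os.path.join("frames", f) for f in frame_files], caption=frame_files, width=160)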

    # Caption Generation
    st.write("πŸ“ Generating captions for extracted frames...")

    def generate_caption(image_path):
        image = Image.open(image_path).convert("RGB")
        inputs = caption_processor(image, return_tensors="pt")
        with torch.no_grad():  # inference only; no gradients needed
            caption_ids = caption_model.generate(**inputs)
        return caption_processor.decode(caption_ids[0], skip_special_tokens=True)

    captions = []
    for filename in sorted(os.listdir("frames")):
        if filename.endswith(".jpg"):
            image_path = os.path.join("frames", filename)
            captions.append(generate_caption(image_path))

    # st.write("πŸ“„ Generated Captions:", captions)

    # Summarization
    st.write("πŸ“– Summarizing captions using AI...")

    def summarize_captions(captions):
        # Join the captions into readable lines instead of interpolating a raw Python list.
        caption_text = "\n".join(captions)
        prompt = (
            "Summarize the following sequence of video frame captions into a "
            f"meaningful story under 800 characters:\n\n{caption_text}"
        )
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an AI that summarizes video content."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=200,
        )
        return completion.choices[0].message.content

    summary = summarize_captions(captions)
    st.write("πŸ“Œ Video Summary:", summary)

    # Text-to-Speech
    st.write("πŸ”Š Generating voice narration...")

    def text_to_speech(text, output_audio="summary_audio.mp3"):
        tts = gTTS(text, lang="en")
        tts.save(output_audio)
    
    text_to_speech(summary)
    st.audio('summary_audio.mp3')

    # # Combine Audio & Video (disabled; assumes MoviePy 1.x for the API below)
    # st.write("🎬 Merging audio with the video...")

    # import moviepy.editor as mp

    # def add_audio_to_video(video_path, audio_path, output_video="final_video.mp4"):
    #     video = mp.VideoFileClip(video_path)
    #     audio = mp.AudioFileClip(audio_path)
    #     # Trim the narration if it runs longer than the video.
    #     if audio.duration > video.duration:
    #         audio = audio.subclip(0, video.duration)
    #     final_video = video.set_audio(audio)
    #     final_video.write_videofile(output_video, codec="libx264", audio_codec="aac")

    # add_audio_to_video(video_path, "summary_audio.mp3")

    # st.video("final_video.mp4")