Spaces:

awacke1
/

GPT-4o-omni-text-audio-image-video

Running

App Files Files Community

awacke1 committed on May 14, 2024

Commit

d907b5f

verified ·

1 Parent(s): d7cecbd

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -14

app.py CHANGED Viewed

@@ -67,21 +67,60 @@ def process_audio(audio_input):
         )
         st.markdown(response.choices[0].message.content)
 def process_video(video_input):
-    if video_input:
-        base64Frames, audio_path = process_video_frames(video_input)
-        transcription = openai.Audio.transcriptions.create(
-            model="whisper-1",
-            file=open(audio_path, "rb"),
-        )
-        frames_text = " ".join([f"[image: data:image/jpg;base64,{frame}]" for frame in base64Frames])
-        response = openai.Completion.create(
-            model=MODEL,
-            prompt=f"You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown. These are the frames from the video. {frames_text} The audio transcription is: {transcription['text']}",
-            max_tokens=500,
-            temperature=0.5,
-        )
-        st.markdown(response.choices[0].text.strip())
 def process_video_frames(video_path, seconds_per_frame=2):
     base64Frames = []

         )
         st.markdown(response.choices[0].message.content)
+def process_video(video_path, seconds_per_frame=2):
+    """Sample JPEG frames from a video and extract its audio track to MP3.
+
+    NOTE(review): a later ``def process_video(video_input)`` in this file
+    rebinds the same name, so this definition is only reachable by code that
+    runs before that point.
+
+    Args:
+        video_path: Path of the video file to read.
+        seconds_per_frame: Seconds of video between two sampled frames.
+
+    Returns:
+        A ``(base64Frames, audio_path)`` tuple: the sampled frames as
+        base64-encoded JPEG strings, and the path of the ``.mp3`` file
+        written next to the video (``None`` when the video has no audio
+        track).
+    """
+    base64Frames = []
+    base_video_path, _ = os.path.splitext(video_path)
+    video = cv2.VideoCapture(video_path)
+    try:
+        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = video.get(cv2.CAP_PROP_FPS)
+        # A step of 0 (fps unavailable, or fps * seconds_per_frame < 1) would
+        # re-read the same frame forever, so always advance by at least one.
+        frames_to_skip = max(1, int(fps * seconds_per_frame))
+        # Loop through the video and extract frames at specified sampling rate
+        for curr_frame in range(0, total_frames - 1, frames_to_skip):
+            video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
+            success, frame = video.read()
+            if not success:
+                break
+            _, buffer = cv2.imencode(".jpg", frame)
+            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+    finally:
+        # Release the capture handle even if decoding or encoding fails.
+        video.release()
+    # Extract audio from video
+    audio_path = f"{base_video_path}.mp3"
+    clip = VideoFileClip(video_path)
+    try:
+        audio = clip.audio
+        if audio is None:
+            # The video has no audio track, so there is nothing to write.
+            audio_path = None
+        else:
+            audio.write_audiofile(audio_path, bitrate="32k")
+            audio.close()
+    finally:
+        clip.close()
+    print(f"Extracted {len(base64Frames)} frames")
+    if audio_path:
+        print(f"Extracted audio to {audio_path}")
+    return base64Frames, audio_path
+# Extract 1 frame per second. You can adjust the `seconds_per_frame` parameter to change the sampling rate
+# NOTE(review): this is a top-level statement, so it runs whenever the module is
+# executed; VIDEO_PATH is not defined in the visible part of this file -- confirm
+# it is assigned before this line runs.
+base64Frames, audio_path = process_video(VIDEO_PATH, seconds_per_frame=1)
+## Generate a summary with visual and audio
 def process_video(video_input):
+    """Summarize a video from its sampled frames and audio transcript.
+
+    Frames and audio are extracted with ``process_video_frames``, the audio is
+    transcribed with the ``whisper-1`` model, and both are sent to ``MODEL``
+    in one chat completion. The Markdown answer is rendered with
+    ``st.markdown``.
+
+    Args:
+        video_input: The video to summarize, passed through unchanged to
+            ``process_video_frames``. Nothing happens when it is falsy.
+    """
+    if not video_input:
+        return
+    # Call the frame/audio helper by its own name: this function rebinds
+    # ``process_video``, so calling that name here would recurse into this
+    # one-argument function and fail on the extra keyword argument.
+    base64Frames, audio_path = process_video_frames(video_input, seconds_per_frame=1)
+    # The prompt below needs the transcript, so create it before the request.
+    with open(audio_path, "rb") as audio_file:
+        transcription = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=audio_file,
+        )
+    frame_parts = [
+        {"type": "image_url",
+         "image_url": {"url": f'data:image/jpg;base64,{frame}', "detail": "low"}}
+        for frame in base64Frames
+    ]
+    response = client.chat.completions.create(
+        model=MODEL,
+        messages=[
+        {"role": "system", "content":"""You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"""},
+        {"role": "user", "content": [
+            "These are the frames from the video.",
+            *frame_parts,
+            {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
+            ],
+        }
+        ],
+        temperature=0,
+    )
+    st.markdown(response.choices[0].message.content)
 def process_video_frames(video_path, seconds_per_frame=2):
     base64Frames = []