Commit b066853 by Michele De Stefano: "Now it is possible to process questions incrementally".
import json
import os
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration
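# Load the BLIP image-captioning processor and model
# (weights are downloaded and cached on first use)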
# model_id = "Salesforce/blip-image-captioning-base"
model_id = "Salesforce/blip-image-captioning-large"
captioning_processor = BlipProcessor.from_pretrained(model_id)
captioning_model = BlipForConditionalGeneration.from_pretrained(model_id)
def extract_frames(video_path, output_folder, interval_ms=2000) -> None:
    """
    Extracts frames from a video into an output folder at a specified time
    interval. Frames are saved as *.jpg images.

    Args:
        video_path: The file name of the video to sample.
        output_folder: The output directory for the extracted frames.
        interval_ms: The sampling interval in milliseconds.

    NOTE: No anti-aliasing filter is applied.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Get fps
    # Compute sampling interval in number of frames to skip
    interval_frames = int(fps * interval_ms * 0.001)
    frame_count = 0
    saved_frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Keep only selected frames
        if frame_count % interval_frames == 0:
            frame_filename = os.path.join(
                output_folder,
                f"frame_{saved_frame_count:04d}.jpg"
            )
            cv2.imwrite(frame_filename, frame)
            saved_frame_count += 1
        frame_count += 1
    cap.release()
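# A minimal usage sketch (the file and folder names are hypothetical
# placeholders, not part of the original code):
#
#     extract_frames("lecture.mp4", "extracted_frames", interval_ms=2000)
#
# This would write frame_0000.jpg, frame_0001.jpg, ... into extracted_frames/.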
def extract_frame_captions(
    video_path,
    interval_ms=2000
) -> str:
    """
    Extracts frame captions from a video at a specified time
    interval.

    Args:
        video_path: The file name of the video to sample.
        interval_ms: The sampling interval in milliseconds.

    NOTE: No anti-aliasing filter is applied.

    Returns:
        Frame descriptions as a JSON-encoded list of strings.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Get fps
    # Compute sampling interval in number of frames to skip
    interval_frames = int(fps * interval_ms * 0.001)
    frame_count = 0
    saved_frame_count = 0
    captions = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Keep only selected frames
        if frame_count % interval_frames == 0:
            # BLIP expects RGB input, while OpenCV reads frames as BGR
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            inputs = captioning_processor(
                frame,
                return_tensors="pt"
            )
            out = captioning_model.generate(**inputs)
            cur_caption = captioning_processor.decode(
                out[0], skip_special_tokens=True
            )
            captions.append(cur_caption)
            saved_frame_count += 1
        frame_count += 1
    cap.release()
    return json.dumps(captions)
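A rough usage sketch (the video path below is a placeholder, not part of the original code): since the function returns a JSON string, the individual captions can be recovered with json.loads.

# Hypothetical end-to-end example: caption a local video every 2 seconds
captions_json = extract_frame_captions("my_video.mp4", interval_ms=2000)
captions = json.loads(captions_json)  # back to a Python list of strings
for i, caption in enumerate(captions):
    print(f"Frame {i}: {caption}")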