import json
import os

import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration

# model_id = "Salesforce/blip-image-captioning-base"
model_id = "Salesforce/blip-image-captioning-large"
captioning_processor = BlipProcessor.from_pretrained(model_id)
captioning_model = BlipForConditionalGeneration.from_pretrained(model_id)


def extract_frames(video_path, output_folder, interval_ms=2000) -> None:
    """
    Extracts frames from a video into an output folder at a specified time interval.

    Frames are saved as *.jpg images.

    Args:
        video_path: The file name of the video to sample.
        output_folder: The output directory for the extracted frames.
        interval_ms: The sampling interval in milliseconds.
            NOTE: No anti-aliasing filter is applied.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Get fps

    # Compute sampling interval in number of frames to skip
    interval_frames = int(fps * interval_ms * 0.001)

    frame_count = 0
    saved_frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Keep only selected frames
        if frame_count % interval_frames == 0:
            frame_filename = os.path.join(
                output_folder, f"frame_{saved_frame_count:04d}.jpg"
            )
            cv2.imwrite(frame_filename, frame)
            saved_frame_count += 1
        frame_count += 1

    cap.release()


def extract_frame_captions(video_path, interval_ms=2000) -> str:
    """
    Extracts frame captions from a video at a specified time interval.

    Args:
        video_path: The file name of the video to sample.
        interval_ms: The sampling interval in milliseconds.
            NOTE: No anti-aliasing filter is applied.

    Returns:
        A JSON-encoded list of frame descriptions, one string per sampled frame.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Get fps

    # Compute sampling interval in number of frames to skip
    interval_frames = int(fps * interval_ms * 0.001)

    frame_count = 0
    captions = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Keep only selected frames
        if frame_count % interval_frames == 0:
            # BLIP expects RGB input; OpenCV decodes frames as BGR
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            inputs = captioning_processor(frame, return_tensors="pt")
            out = captioning_model.generate(**inputs)
            cur_caption = captioning_processor.decode(out[0], skip_special_tokens=True)
            captions.append(cur_caption)
        frame_count += 1

    cap.release()
    return json.dumps(captions)
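

# Example usage: a minimal sketch of how the two functions above might be called.
# The video path and output folder below are hypothetical placeholders, not files
# provided with this code.
if __name__ == "__main__":
    sample_video = "sample_video.mp4"   # hypothetical input video
    frames_dir = "extracted_frames"     # hypothetical output folder

    # Save one frame every 2 seconds as JPEG images.
    extract_frames(sample_video, frames_dir, interval_ms=2000)

    # Caption one frame every 2 seconds and print the decoded list of captions.
    captions_json = extract_frame_captions(sample_video, interval_ms=2000)
    print(json.loads(captions_json))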