"""Video evaluation utilities: GPT-4o rubric scoring plus CLIP, optical-flow,
and SSIM-based metrics for comparing generated videos against a storyboard."""

import json

import cv2
import numpy as np
import torch

from dotenv import load_dotenv
from openai import OpenAI
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from skimage.metrics import structural_similarity as ssim


# Load OPENAI_API_KEY from the environment / .env file
load_dotenv()
client = OpenAI()

# CLIP model and processor used for image-text similarity scoring
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def evaluate_with_gpt4(storyboard, video_description):
    """Score story consistency, shot variety, and relevance with GPT-4o."""
    system_prompt = (
        "You are a film critic evaluating how well a video matches a storyboard.\n"
        "Rate each of the following from 1 to 10:\n"
        "- Story Consistency: Does the video follow the scene and emotion described?\n"
        "- Shot Variety: Does it use interesting or varied camera angles?\n"
        "- Relevance: Does it suit the intended purpose (role, setting, emotion)?\n\n"
        "Provide scores and brief justifications for each.\n\n"
        "Format the output as JSON:\n"
        "{\n"
        "  \"story_consistency\": <score>,\n"
        "  \"shot_variety\": <score>,\n"
        "  \"relevance\": <score>,\n"
        "  \"justification\": \"...\"\n"
        "}"
    )

    user_prompt = (
        f"Storyboard:\n"
        f"Scene: {storyboard['scene']}\n"
        f"Shot: {storyboard['shot_type']}\n"
        f"Emotion: {storyboard['emotion']}\n\n"
        f"Video Description:\n{video_description}"
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.3,
        # Constrain the model to valid JSON so json.loads below cannot
        # fail on Markdown-fenced output
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )

    content = response.choices[0].message.content.strip()
    return json.loads(content)


def compute_clip_similarity(image_path, text_prompt):
    """Cosine similarity between a thumbnail and the text prompt in CLIP space."""
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(text=[text_prompt], images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # With a single prompt, softmax over logits_per_image always returns 1.0,
    # so use the cosine similarity of the (already L2-normalized) embeddings.
    similarity = (outputs.image_embeds @ outputs.text_embeds.T).item()


def compute_motion_score(video_path):
    """Mean dense optical-flow magnitude across the video (higher = more motion)."""
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    motion_values = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_gray is not None:
            # Farneback dense optical flow between consecutive frames
            flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None,
                                                0.5, 3, 15, 3, 5, 1.2, 0)
            magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
            motion_values.append(np.mean(magnitude))

        prev_gray = gray

    cap.release()
    return float(np.mean(motion_values)) if motion_values else 0.0


def compute_temporal_coherence(video_path):
    """Mean SSIM between consecutive frames (higher = smoother, more coherent)."""
    cap = cv2.VideoCapture(video_path)
    prev_frame = None
    ssim_scores = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_frame is not None:
            # Structural similarity of consecutive grayscale frames
            score = ssim(prev_frame, gray)
            ssim_scores.append(score)

        prev_frame = gray

    cap.release()
    return float(np.mean(ssim_scores)) if ssim_scores else 0.0


def evaluate_video(storyboard, video_description, video_path, thumbnail_path, text_prompt):
    """Aggregate GPT-4o rubric scores with CLIP, motion, and coherence metrics."""
    gpt_eval = evaluate_with_gpt4(storyboard, video_description)
    clip_score = compute_clip_similarity(thumbnail_path, text_prompt)
    motion_score = compute_motion_score(video_path)
    coherence_score = compute_temporal_coherence(video_path)

    return {
        "gpt_eval": gpt_eval,
        "metrics": {
            "clip_similarity": clip_score,
            "motion_score": motion_score,
            "temporal_coherence": coherence_score
        }
    }
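

# Usage sketch: the storyboard fields match what evaluate_with_gpt4 expects;
# the file paths and prompt below are hypothetical placeholders, shown only
# to illustrate how the pieces fit together.
if __name__ == "__main__":
    storyboard = {
        "scene": "A detective enters a dimly lit office at night",
        "shot_type": "slow dolly-in, medium shot",
        "emotion": "tense, suspenseful",
    }
    result = evaluate_video(
        storyboard=storyboard,
        video_description="A man in a trench coat walks into a dark, rain-lit office.",
        video_path="outputs/scene_01.mp4",      # hypothetical path
        thumbnail_path="outputs/scene_01.jpg",  # hypothetical path
        text_prompt="a detective in a dimly lit office at night",
    )
    print(json.dumps(result, indent=2))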