File size: 1,201 Bytes
8077ead
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Shared sentence-embedding model, instantiated once at import time and reused
# by build_embeddings() and find_video() so both encode text with the same model.
model = SentenceTransformer("all-MiniLM-L6-v2")

def build_embeddings(video_folder="data/videos", output_file="data/video_metadata.json"):
    """Embed a description of every video file in a folder and save it as JSON.

    Each file whose extension is ``.mp4``, ``.mov`` or ``.avi`` (matched
    case-insensitively) gets a description derived from its file name: the
    extension is dropped and underscores become spaces. The description is
    encoded with the module-level ``model`` and stored alongside the file name.

    Args:
        video_folder: Directory that is scanned (non-recursively) for videos.
        output_file: Path of the JSON file to write. It receives a list of
            ``{"file", "description", "embedding"}`` objects.
    """
    video_extensions = (".mp4", ".mov", ".avi")
    metadata = []
    # os.listdir() returns entries in arbitrary order; sort so the output file
    # (and any tie-breaking done by consumers) is reproducible across runs.
    for file in sorted(os.listdir(video_folder)):
        stem, extension = os.path.splitext(file)
        if extension.lower() not in video_extensions:
            continue
        # Only the real suffix is removed, so names that merely contain
        # ".mp4"/".mov"/".avi" in the middle keep their full text.
        description = stem.replace("_", " ")
        embedding = model.encode(description).tolist()
        metadata.append({
            "file": file,
            "description": description,
            "embedding": embedding,
        })
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f)

def find_video(user_query, metadata_file="data/video_metadata.json"):
    """Return the video whose description is most similar to a text query.

    The metadata file is expected to hold a JSON list of objects with
    ``"file"``, ``"description"`` and ``"embedding"`` keys, as written by
    ``build_embeddings``. The query is encoded with the module-level ``model``
    and compared to every stored embedding by cosine similarity.

    Args:
        user_query: Free-text query to match against video descriptions.
        metadata_file: Path of the JSON metadata file to search.

    Returns:
        A ``(file, description)`` tuple for the best-matching video. On ties
        the entry that appears first in the metadata file wins.

    Raises:
        ValueError: If the metadata file contains no videos.
    """
    with open(metadata_file, "r", encoding="utf-8") as f:
        videos = json.load(f)

    # Fail fast with a clear message, before paying for the query encoding.
    if not videos:
        raise ValueError(f"No videos found in metadata file: {metadata_file!r}")

    query_embedding = model.encode(user_query).reshape(1, -1)
    # One call scores every video at once; the result has shape (n_videos, 1).
    embeddings = [v["embedding"] for v in videos]
    similarities = cosine_similarity(embeddings, query_embedding).ravel()
    best = videos[int(similarities.argmax())]

    return best["file"], best["description"]