"""Index video files by filename and retrieve the best match for a text query.

``build_embeddings`` turns every video filename in a folder into a short
description, embeds it with a sentence-transformers model and stores the
result as JSON.  ``find_video`` embeds a user query with the same model and
returns the indexed video whose description is most similar to it.
"""

import json
import os

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Lower-case file extensions that are treated as videos.
_VIDEO_EXTENSIONS = (".mp4", ".mov", ".avi")

# Shared embedding model, created once at import time and used by both
# functions so that descriptions and queries are embedded the same way.
model = SentenceTransformer("all-MiniLM-L6-v2")


def build_embeddings(video_folder="data/videos", output_file="data/video_metadata.json"):
    """Embed the filename of every video in *video_folder* and save the index.

    For each file whose extension is one of ``.mp4``, ``.mov`` or ``.avi``
    (compared case-insensitively), the description is the filename without
    its extension and with underscores replaced by spaces.

    Args:
        video_folder: Directory to scan (not recursive).
        output_file: Path of the JSON file to write.  It receives a list of
            objects with the keys ``"file"``, ``"description"`` and
            ``"embedding"`` (the embedding as a plain list of floats).

    Raises:
        OSError: If *video_folder* cannot be listed or *output_file* cannot
            be written.
    """
    metadata = []
    # os.listdir returns entries in arbitrary order; sorting keeps the index
    # (and therefore tie-breaking in find_video) reproducible between runs.
    for file in sorted(os.listdir(video_folder)):
        # splitext removes only the real trailing extension, so a name such
        # as "intro.mp4_final.mov" keeps its inner ".mp4" text intact.
        stem, extension = os.path.splitext(file)
        if extension.lower() not in _VIDEO_EXTENSIONS:
            continue
        description = stem.replace("_", " ")
        # .tolist() converts the encoded array into JSON-serialisable floats.
        embedding = model.encode(description).tolist()
        metadata.append({
            "file": file,
            "description": description,
            "embedding": embedding,
        })

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f)


def find_video(user_query, metadata_file="data/video_metadata.json"):
    """Return the indexed video whose description best matches *user_query*.

    Args:
        user_query: Free-text query to embed and compare against the index.
        metadata_file: Path of the JSON index written by ``build_embeddings``.

    Returns:
        A ``(file, description)`` tuple for the entry with the highest cosine
        similarity to the query.  On ties the earliest entry in the index
        wins.

    Raises:
        ValueError: If the index contains no videos.
        OSError: If *metadata_file* cannot be read.
    """
    with open(metadata_file, "r", encoding="utf-8") as f:
        videos = json.load(f)

    if not videos:
        # Previously surfaced as "max() arg is an empty sequence".
        raise ValueError(
            f"No videos indexed in {metadata_file!r}; run build_embeddings() first."
        )

    # cosine_similarity expects 2-D inputs, hence the (1, n_features) reshape.
    query_embedding = model.encode(user_query).reshape(1, -1)
    # One vectorised call compares the query against every stored embedding;
    # the result has shape (1, len(videos)), so row 0 holds all the scores.
    similarities = cosine_similarity(
        query_embedding, [v["embedding"] for v in videos]
    )[0]
    # argmax returns the first index of the maximum, like list.index(max(...)).
    best = videos[int(similarities.argmax())]
    return best["file"], best["description"]