Spaces:
Sleeping
Sleeping
import os | |
import json | |
from sentence_transformers import SentenceTransformer | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Shared sentence encoder, loaded once at import time so that build_embeddings()
# and find_video() embed text with the same checkpoint.
model = SentenceTransformer("all-MiniLM-L6-v2")
def build_embeddings(video_folder="data/videos", output_file="data/video_metadata.json"):
    """Build a JSON index of sentence embeddings for the videos in a folder.

    Every entry of ``video_folder`` whose name ends in ``.mp4``, ``.mov`` or
    ``.avi`` gets a description derived from its file name (trailing extension
    removed, underscores turned into spaces). The description is encoded with
    the module-level ``model`` and the collected records are written to
    ``output_file`` as a JSON list of objects with the keys ``"file"``,
    ``"description"`` and ``"embedding"``.

    Args:
        video_folder: Directory to scan (only its direct entries are listed).
        output_file: Path of the JSON file to write; it is opened in ``"w"``
            mode, so existing content is replaced.
    """
    metadata = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the index
    # (and any tie-breaking done by consumers of it) reproducible across runs.
    for file in sorted(os.listdir(video_folder)):
        if not file.endswith((".mp4", ".mov", ".avi")):
            continue
        # Strip only the trailing extension. Chained str.replace() calls would
        # also delete ".mp4"/".mov"/".avi" appearing in the middle of a name.
        stem, _extension = os.path.splitext(file)
        description = stem.replace("_", " ")
        embedding = model.encode(description).tolist()
        metadata.append({
            "file": file,
            "description": description,
            "embedding": embedding,
        })
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f)
def find_video(user_query, metadata_file="data/video_metadata.json"):
    """Return the indexed video whose description best matches a text query.

    The index is the JSON list written by ``build_embeddings``: each record
    holds ``"file"``, ``"description"`` and ``"embedding"``. The query is
    encoded with the module-level ``model`` and compared to every stored
    embedding by cosine similarity.

    Args:
        user_query: Free-text query to match against the video descriptions.
        metadata_file: Path of the JSON index to read.

    Returns:
        A ``(file, description)`` tuple for the highest-scoring record. When
        several records tie, the first one in the index wins.

    Raises:
        ValueError: If the index contains no records.
    """
    with open(metadata_file, "r", encoding="utf-8") as f:
        videos = json.load(f)
    if not videos:
        # Fail with an actionable message instead of the opaque
        # "max() arg is an empty sequence" the lookup would otherwise hit.
        raise ValueError("no videos indexed in {!r}".format(metadata_file))
    query_embedding = model.encode(user_query).reshape(1, -1)
    # One vectorised call scores the query against every stored embedding,
    # instead of invoking cosine_similarity once per video.
    stored_embeddings = [v["embedding"] for v in videos]
    scores = cosine_similarity(query_embedding, stored_embeddings)[0]
    # argmax returns the first maximum, matching list.index(max(...)).
    best = videos[int(scores.argmax())]
    return best["file"], best["description"]