Create retriever.py
Browse files- retriever.py +30 -0
retriever.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
+
|
6 |
+
# Shared sentence-embedding model. It is instantiated once, at import time,
# and reused by build_embeddings() and find_video() so that stored and query
# embeddings come from the same model.
model = SentenceTransformer("all-MiniLM-L6-v2")
|
7 |
+
|
8 |
+
def build_embeddings(video_folder="data/videos", output_file="data/video_metadata.json"):
    """Embed a description of every video in *video_folder* and save it as JSON.

    A video's description is its file name without the extension and with
    underscores turned into spaces. One record per video is written to
    *output_file* as a JSON list of objects with the keys ``"file"``,
    ``"description"`` and ``"embedding"`` (the model's encoding of the
    description, converted with ``.tolist()``).

    Args:
        video_folder: Directory scanned (non-recursively) for files ending
            in ``.mp4``, ``.mov`` or ``.avi`` (case-insensitive).
        output_file: Path of the JSON file to write; overwritten if present.
    """
    video_extensions = (".mp4", ".mov", ".avi")
    metadata = []
    # os.listdir returns entries in arbitrary order; sort so the generated
    # file is reproducible across runs and platforms.
    for filename in sorted(os.listdir(video_folder)):
        # Compare case-insensitively so that e.g. "CLIP.MP4" is picked up too.
        if not filename.lower().endswith(video_extensions):
            continue
        # Strip only the trailing extension: chained str.replace would also
        # delete ".mp4"/".mov"/".avi" occurring in the middle of a name.
        stem, _extension = os.path.splitext(filename)
        description = stem.replace("_", " ")
        metadata.append({
            "file": filename,
            "description": description,
            "embedding": model.encode(description).tolist(),
        })
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f)
|
21 |
+
|
22 |
+
def find_video(user_query, metadata_file="data/video_metadata.json"):
    """Return the stored video whose description best matches *user_query*.

    Args:
        user_query: Free-text query; it is embedded with the module-level
            model and compared against every stored embedding.
        metadata_file: JSON file holding a list of records with the keys
            ``"file"``, ``"description"`` and ``"embedding"`` (the format
            written by ``build_embeddings``).

    Returns:
        A ``(file, description)`` tuple for the record with the highest
        cosine similarity to the query. Ties go to the earliest record.

    Raises:
        ValueError: If *metadata_file* contains no records.
    """
    with open(metadata_file, "r", encoding="utf-8") as f:
        videos = json.load(f)

    # Fail with a clear message instead of the opaque
    # "max() arg is an empty sequence" raised on an empty list.
    if not videos:
        raise ValueError(f"no video metadata found in {metadata_file!r}")

    query_embedding = model.encode(user_query).reshape(1, -1)
    # One pairwise call over all stored vectors instead of one call per
    # video; row 0 holds the query's similarity to each record.
    stored_embeddings = [video["embedding"] for video in videos]
    similarities = cosine_similarity(query_embedding, stored_embeddings)[0]
    # argmax returns the first maximum, matching list.index(max(...)).
    best_idx = int(similarities.argmax())

    return videos[best_idx]["file"], videos[best_idx]["description"]
|