# Spaces: Sleeping
# (Page-status text captured together with the source; not part of the program.)
import numpy as np
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
# Module-level singletons shared by every helper below. They are created at
# import time, so importing this module loads the embedding model and
# constructs a MongoDB client.

# Sentence-embedding model used for both document chunks and queries, so the
# two kinds of vectors are directly comparable.
model = SentenceTransformer('all-MiniLM-L6-v2')

# NOTE(review): MongoClient() is called without a URI, so the driver's default
# host is used. query_rag relies on the $vectorSearch stage, which presumably
# needs an Atlas deployment with a vector index -- confirm the connection
# target before deploying.
client = MongoClient()
db = client['huggingwhale']
collection = db['docs']
def chunk_text(text, chunk_size=300):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are produced by ``str.split()`` (any run of whitespace is a
    separator) and re-joined with single spaces, so the original spacing and
    line breaks are not preserved inside a chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order. The last chunk may hold
        fewer than *chunk_size* words. The list is empty when *text* contains
        no words.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    # Without this check a zero step surfaces as an opaque range() error and a
    # negative step silently yields no chunks at all.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
def embed_chunks(chunks):
    """Encode text chunks with the module-level ``model``.

    Args:
        chunks: The chunk strings to embed (for example the output of
            ``chunk_text``).

    Returns:
        The result of ``model.encode(chunks)`` converted with ``.tolist()``,
        i.e. plain Python lists rather than an array, which is the form
        ``store_embeddings`` writes to the database. An empty input returns
        an empty list.
    """
    # Nothing to encode: skip the model call entirely.
    if not chunks:
        return []
    return model.encode(chunks).tolist()
def store_embeddings(chunks, embeddings):
    """Insert one document per (chunk, embedding) pair into ``collection``.

    Each stored document has the shape
    ``{"chunk": <chunk>, "embedding": <embedding>}``.

    Args:
        chunks: Iterable of chunk strings.
        embeddings: Iterable of embeddings, aligned with *chunks* by position.

    Returns:
        None.

    Raises:
        ValueError: If *chunks* and *embeddings* differ in length.
    """
    # Materialise both inputs so their lengths can be compared; a plain zip()
    # would silently drop the unmatched tail of the longer one.
    chunks = list(chunks)
    embeddings = list(embeddings)
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"chunks and embeddings must have the same length, "
            f"got {len(chunks)} and {len(embeddings)}"
        )

    # insert_many rejects an empty document list, so an empty batch is a no-op.
    if not chunks:
        return

    docs = [
        {"chunk": chunk, "embedding": emb}
        for chunk, emb in zip(chunks, embeddings)
    ]
    collection.insert_many(docs)
def query_rag(question, top_k=3):
    """Return the stored chunks most similar to *question*.

    The question is embedded with the module-level ``model`` and sent to the
    ``$vectorSearch`` aggregation stage, searching the ``embedding`` field of
    ``collection`` through the index named ``"default"``.

    Args:
        question: The query text.
        top_k: Maximum number of chunks to return; must be at least 1.

    Returns:
        A list with the ``chunk`` value of each matching document, in the
        order the aggregation returns them.

    Raises:
        ValueError: If *top_k* is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    # encode() yields an array; the driver can only serialise plain Python
    # numbers, so convert the query vector before building the pipeline.
    question_vec = model.encode([question])[0].tolist()

    pipeline = [
        {
            "$vectorSearch": {
                "index": "default",
                "path": "embedding",
                "queryVector": question_vec,
                # The candidate pool must never be smaller than the limit.
                "numCandidates": max(100, top_k),
                "limit": top_k,
            }
        },
        # Only the chunk text is used below; do not ship embeddings back.
        {"$project": {"_id": 0, "chunk": 1}},
    ]
    results = collection.aggregate(pipeline)
    return [doc['chunk'] for doc in results]