File size: 1,058 Bytes
cfd2831
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
import numpy as np

# Sentence-embedding model shared by the functions below; it is constructed
# once, when this module is imported.
model = SentenceTransformer('all-MiniLM-L6-v2')
# MongoClient() is called with no arguments, so the driver's default
# connection settings are used (presumably localhost:27017 -- confirm for
# the target deployment).
client = MongoClient()
db = client['huggingwhale']
# Collection that store_embeddings() writes to and query_rag() searches.
collection = db['docs']

def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* words.

    Words are obtained with ``str.split()``, so any run of whitespace
    (including newlines) acts as a single separator, and the words of each
    chunk are re-joined with single spaces.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in their original order. The final chunk may
        hold fewer than *chunk_size* words. Empty or whitespace-only text
        yields an empty list.

    Raises:
        ValueError: If *chunk_size* is less than 1.
    """
    # A negative step makes range() empty, which would silently discard the
    # whole text; a zero step fails inside range() with a message that never
    # mentions chunk_size. Reject both up front with a clear error.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def embed_chunks(chunks):
    """Encode *chunks* with the module-level model and return plain lists.

    Args:
        chunks: The texts to embed, passed straight to ``model.encode``.

    Returns:
        The result of ``model.encode(chunks)`` converted with ``.tolist()``.
    """
    vectors = model.encode(chunks)
    return vectors.tolist()

def store_embeddings(chunks, embeddings):
    """Insert one document per (chunk, embedding) pair into ``collection``.

    Each stored document has the form
    ``{"chunk": <chunk>, "embedding": <embedding>}``. Pairs are formed with
    ``zip``, so if the two inputs differ in length the surplus items of the
    longer one are not stored.

    Args:
        chunks: Iterable of chunk texts.
        embeddings: Iterable of embeddings, aligned with *chunks*.

    Returns:
        None.
    """
    docs = [
        {"chunk": chunk, "embedding": emb}
        for chunk, emb in zip(chunks, embeddings)
    ]
    # PyMongo's insert_many() rejects an empty document list, so an empty
    # batch (e.g. from chunking an empty text) is treated as a no-op.
    if not docs:
        return
    collection.insert_many(docs)

def query_rag(question, top_k=3):
    """Return the texts of the stored chunks most similar to *question*.

    The question is embedded with the module-level model and used as the
    query vector of a ``$vectorSearch`` aggregation stage on ``collection``.

    Args:
        question: The query text.
        top_k: Maximum number of chunks to return.

    Returns:
        A list with the ``"chunk"`` field of each document the aggregation
        yields, in the order the aggregation yields them.
    """
    # encode() returns a NumPy array by default, which PyMongo cannot
    # serialize to BSON; convert to a plain list as embed_chunks() does.
    question_vec = model.encode([question])[0].tolist()
    pipeline = [
        {
            "$vectorSearch": {
                # NOTE(review): assumes a vector search index named "default"
                # exists on the "embedding" field -- confirm.
                "index": "default",
                "path": "embedding",
                "queryVector": question_vec,
                # $vectorSearch requires numCandidates >= limit; keep the
                # original 100 as a floor and grow it for larger top_k.
                "numCandidates": max(100, top_k),
                "limit": top_k
            }
        }
    ]
    results = collection.aggregate(pipeline)
    return [doc['chunk'] for doc in results]