Schmitz005 committed on
Commit
0468fdd
·
verified ·
1 Parent(s): 8baa906

Create rag.py

Browse files
Files changed (1) hide show
  1. whale_core/rag.py +37 -0
whale_core/rag.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from pymongo import MongoClient
3
+ import numpy as np
4
+
5
# Shared embedding model, created once when this module is imported and used
# by both embed_chunks() and query_rag().
# NOTE(review): constructing the model at import time makes importing this
# module as slow as loading the model — confirm that is acceptable.
model = SentenceTransformer('all-MiniLM-L6-v2')
# NOTE(review): MongoClient() is called with no arguments, so it uses the
# driver's default host — presumably a local mongod. query_rag() relies on the
# $vectorSearch stage; confirm the target deployment supports it.
client = MongoClient()
db = client['huggingwhale']
# Collection holding {"chunk": ..., "embedding": ...} documents written by
# store_embeddings() and searched by query_rag().
collection = db['docs']
9
+
10
def chunk_text(text, chunk_size=300):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    each chunk re-joins its words with single spaces, so the original
    whitespace is not preserved. The final chunk may be shorter than
    *chunk_size*.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings, empty when *text* contains no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces
13
+
14
def embed_chunks(chunks):
    """Encode *chunks* with the module-level model and return plain lists.

    Args:
        chunks: The texts to embed, passed straight to ``model.encode``.

    Returns:
        The encoder output converted with ``.tolist()`` — presumably one
        list of floats per chunk, in input order.
    """
    vectors = model.encode(chunks)
    return vectors.tolist()
16
+
17
def store_embeddings(chunks, embeddings):
    """Insert one document per (chunk, embedding) pair into the collection.

    Each document has the shape ``{"chunk": <text>, "embedding": <vector>}``.
    Pairs are formed with ``zip``, so any surplus items in the longer of the
    two inputs are ignored.

    Args:
        chunks: The chunk texts.
        embeddings: The embedding for each chunk, in the same order.

    Returns:
        None. When there is nothing to store, no database call is made.
    """
    docs = [
        {"chunk": chunk, "embedding": emb}
        for chunk, emb in zip(chunks, embeddings)
    ]
    # insert_many() rejects an empty document list, so empty input (for
    # example the chunks of an empty text) is treated as a no-op.
    if not docs:
        return
    collection.insert_many(docs)
23
+
24
def query_rag(question, top_k=3):
    """Return the stored chunks most similar to *question*.

    The question is embedded with the module-level model and sent to the
    collection through a ``$vectorSearch`` aggregation stage against the
    "default" index on the "embedding" field.

    Args:
        question: The query text.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of the ``chunk`` strings from the matching documents, in the
        order the aggregation yields them.
    """
    # encode() yields a NumPy array, which the BSON encoder cannot serialise;
    # convert it to a plain list of floats before putting it in the pipeline.
    question_vec = model.encode([question])[0].tolist()
    # $vectorSearch requires limit <= numCandidates, so widen the candidate
    # pool when the caller asks for more than the default 100.
    num_candidates = max(100, top_k)
    results = collection.aggregate([
        {
            "$vectorSearch": {
                "index": "default",
                "path": "embedding",
                "queryVector": question_vec,
                "numCandidates": num_candidates,
                "limit": top_k,
            }
        }
    ])
    return [doc['chunk'] for doc in results]