Spaces:

Schmitz005
/

huggingwhale.ai

Sleeping

Create rag.py

0468fdd verified 26 days ago

1.06 kB

	from sentence_transformers import SentenceTransformer
	from pymongo import MongoClient
	import numpy as np

	# Sentence-embedding model shared by embed_chunks() and query_rag(); it is
	# constructed once, at import time, from the 'all-MiniLM-L6-v2' model name.
	model = SentenceTransformer('all-MiniLM-L6-v2')
	# NOTE(review): no connection URI is passed, so the driver falls back to its
	# built-in default host -- confirm that this deployment supports the
	# $vectorSearch stage that query_rag() relies on.
	client = MongoClient()
	# Database and collection holding the {"chunk", "embedding"} documents
	# written by store_embeddings() and searched by query_rag().
	db = client['huggingwhale']
	collection = db['docs']

	def chunk_text(text, chunk_size=300):
	    """Split *text* into consecutive chunks of at most *chunk_size* words.

	    Words are whatever ``str.split()`` yields, so any run of whitespace
	    (including newlines) acts as a separator, and the words of each chunk
	    are re-joined with single spaces.

	    Args:
	        text: The string to split.
	        chunk_size: Maximum number of words per chunk; must be >= 1.

	    Returns:
	        A list of chunk strings in their original order. The last chunk may
	        hold fewer than *chunk_size* words. Empty or whitespace-only text
	        gives an empty list.

	    Raises:
	        ValueError: If *chunk_size* is less than 1.
	    """
	    # A negative step would make range() empty and silently drop all of the
	    # text; a zero step fails with an unrelated-looking range() error.
	    if chunk_size < 1:
	        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")
	    words = text.split()
	    return [
	        " ".join(words[start:start + chunk_size])
	        for start in range(0, len(words), chunk_size)
	    ]

	def embed_chunks(chunks):
	    """Encode *chunks* with the module-level embedding model.

	    Args:
	        chunks: The texts to embed; handed to ``model.encode`` unchanged.

	    Returns:
	        The encoder output converted with its ``.tolist()`` method.
	    """
	    vectors = model.encode(chunks)
	    # Presumably this turns the encoder's array into nested Python lists
	    # that can be stored as-is -- confirm against the encoder's return type.
	    return vectors.tolist()

	def store_embeddings(chunks, embeddings):
	    """Insert one document per (chunk, embedding) pair into ``collection``.

	    Each document has the shape
	    ``{"chunk": <chunk>, "embedding": <embedding>}``. Pairs are formed with
	    ``zip``, so pairing stops at the shorter of the two inputs.

	    Args:
	        chunks: Iterable of chunk texts.
	        embeddings: Iterable of embeddings, in the same order as *chunks*.

	    Returns:
	        None.
	    """
	    docs = [
	        {"chunk": chunk, "embedding": emb}
	        for chunk, emb in zip(chunks, embeddings)
	    ]
	    # insert_many() rejects an empty document list, and chunk_text() returns
	    # [] for empty text, so having nothing to store is treated as a no-op.
	    if not docs:
	        return
	    collection.insert_many(docs)

	def query_rag(question, top_k=3):
	    """Return the stored chunks retrieved by a vector search for *question*.

	    The question is embedded with the module-level model and sent to
	    ``collection`` through a ``$vectorSearch`` aggregation stage that uses
	    the index named "default" on the "embedding" field.

	    Args:
	        question: The query text.
	        top_k: Value passed as the stage's ``limit``.

	    Returns:
	        A list with the "chunk" field of each returned document, in the
	        order the aggregation yields them.
	    """
	    # Convert to a plain list, as embed_chunks() does: the encoder's array
	    # object cannot be BSON-encoded inside the pipeline.
	    question_vec = model.encode([question])[0].tolist()
	    # $vectorSearch requires numCandidates >= limit, so widen the candidate
	    # pool when a caller asks for more than the usual 100.
	    num_candidates = max(100, top_k)
	    pipeline = [
	        {
	            "$vectorSearch": {
	                "index": "default",
	                "path": "embedding",
	                "queryVector": question_vec,
	                "numCandidates": num_candidates,
	                "limit": top_k,
	            }
	        }
	    ]
	    return [doc["chunk"] for doc in collection.aggregate(pipeline)]