# First_RAG_System / rag_system.py
# HamidOmarov's picture
# Update rag_system.py
# aad903a verified
import hashlib

from pdf_loader import load_pdf
from optimal_chunker import chunk_documents
from embedder_light import get_embedder, embed_text
from vector_store import get_chroma_client, create_collection
class RAGPipeline:
    """Minimal retrieval pipeline: PDF -> chunks -> embeddings -> collection.

    The embedder and the collection are created once in ``__init__`` and
    reused by every ``index_document`` / ``query`` call.
    """

    def __init__(self):
        """Load the embedder and open the vector-store collection."""
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)

    def index_document(self, pdf_path: str) -> None:
        """Load a PDF, chunk it, embed the chunks and add them to the collection.

        Chunk ids embed a hash of ``pdf_path`` so that chunks coming from
        different documents do not share ids (a plain running index would
        collide as soon as a second document is indexed). Indexing the same
        path again produces the same ids.

        Args:
            pdf_path: Location of the PDF; passed unchanged to ``load_pdf``.
        """
        print(f"📄 Loading: {pdf_path}")
        docs = load_pdf(pdf_path)

        print("✂️ Chunking...")
        chunks = chunk_documents(docs)
        if not chunks:
            # Nothing to embed or store; skip the embedder and the collection
            # call instead of handing them empty lists.
            print(f"⚠️ No text chunks extracted from {pdf_path}; nothing indexed.")
            return

        print("🔢 Creating embeddings...")
        texts = [chunk.page_content for chunk in chunks]
        vectors = embed_text(texts, self.tokenizer, self.model)

        print("🧠 Adding to ChromaDB...")
        # str() keeps this working for path-like arguments as well as strings.
        doc_key = hashlib.sha256(str(pdf_path).encode("utf-8")).hexdigest()[:12]
        ids = [f"doc_{doc_key}_{i}" for i in range(len(texts))]
        self.collection.add(documents=texts, embeddings=vectors, ids=ids)
        print(f"✅ Indexed {len(texts)} chunks.")

    def query(self, question: str, n_results: int = 3) -> str:
        """Look up the stored chunks the collection returns for ``question``.

        Prints a 200-character preview of every hit and returns the hits as a
        numbered list in which each entry is cut to 500 characters.

        Args:
            question: Query text; it is embedded with the pipeline's embedder.
            n_results: Number of chunks requested from the collection
                (defaults to 3).

        Returns:
            The numbered hits separated by blank lines, or an empty string
            when the collection returns no documents.
        """
        print(f"❓ Question: {question}")
        question_vec = embed_text([question], self.tokenizer, self.model)[0]
        results = self.collection.query(
            query_embeddings=[question_vec],
            n_results=n_results,
        )
        # Index 0 selects the hits for the single query embedding sent above.
        documents = results["documents"][0]

        print("\n🔍 Top Documents:")
        for rank, doc in enumerate(documents, start=1):
            print(f"{rank}. {doc[:200]}...\n")

        # Return the answer so it can be shown as the HF Spaces output.
        return "\n\n".join(
            f"{rank}. {doc[:500]}" for rank, doc in enumerate(documents, start=1)
        )
if __name__ == "__main__":
    # Manual smoke run: index one PDF, then ask a single question about it.
    sample_pdf = "sample.pdf"
    sample_question = "What is this document about?"

    pipeline = RAGPipeline()
    pipeline.index_document(sample_pdf)
    pipeline.query(sample_question)