from pdf_loader import load_pdf
from optimal_chunker import chunk_documents
from embedder_light import get_embedder, embed_text
from vector_store import get_chroma_client, create_collection


class RAGPipeline:
    """End-to-end RAG pipeline: load a PDF, chunk it, embed the chunks,
    store them in ChromaDB, and answer queries by similarity search."""

    def __init__(self):
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)

    def index_document(self, pdf_path):
        print(f"📄 Loading: {pdf_path}")
        docs = load_pdf(pdf_path)

        print("✂️ Chunking...")
        chunks = chunk_documents(docs)

        print("🔢 Creating embeddings...")
        texts = [chunk.page_content for chunk in chunks]
        vectors = embed_text(texts, self.tokenizer, self.model)

        print("🧠 Adding to ChromaDB...")
        # Note: these IDs restart at doc_0 for every call, so indexing a
        # second PDF would collide with the first; derive IDs from
        # pdf_path if you need to index more than one file.
        ids = [f"doc_{i}" for i in range(len(texts))]
        self.collection.add(documents=texts, embeddings=vectors, ids=ids)
        print(f"✅ Indexed {len(texts)} chunks.")

    def query(self, question):
        print(f"❓ Question: {question}")
        question_vec = embed_text([question], self.tokenizer, self.model)[0]
        results = self.collection.query(
            query_embeddings=[question_vec],
            n_results=3,
        )

        print("\n🔍 Top Documents:")
        for i, doc in enumerate(results["documents"][0]):
            print(f"{i+1}. {doc[:200]}...\n")

        # Return the answer for the HF Spaces output
        return "\n\n".join(
            f"{i+1}. {doc[:500]}" for i, doc in enumerate(results["documents"][0])
        )


if __name__ == "__main__":
    rag = RAGPipeline()
    rag.index_document("sample.pdf")
    rag.query("What is this document about?")
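

# ---------------------------------------------------------------------
# For reference: minimal sketches of two of the helper modules the
# pipeline imports above. The real embedder_light and vector_store
# sources are not shown here, so the model name, pooling strategy, and
# collection name below are assumptions, not the actual implementation.

# --- embedder_light (sketch): a small Hugging Face transformers
# encoder with mean pooling over non-padding tokens ---
from transformers import AutoModel, AutoTokenizer
import torch


def get_embedder(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    # Model choice is an assumption; any sentence-embedding checkpoint
    # loadable via AutoModel works the same way.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()
    return tokenizer, model


def embed_text(texts, tokenizer, model):
    # Tokenize as one padded batch, encode, mean-pool hidden states over
    # the attention mask, and return plain Python lists, which ChromaDB
    # accepts as embeddings.
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return pooled.tolist()


# --- vector_store (sketch): an in-memory ChromaDB client; the
# collection name "rag_docs" is hypothetical ---
import chromadb


def get_chroma_client():
    # In-memory client; swap in chromadb.PersistentClient(path=...)
    # to persist the index to disk between runs.
    return chromadb.Client()


def create_collection(client, name="rag_docs"):
    # get_or_create avoids "collection already exists" errors on re-runs.
    return client.get_or_create_collection(name=name)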