from pdf_loader import load_pdf
from optimal_chunker import chunk_documents
from embedder_light import get_embedder, embed_text
from vector_store import get_chroma_client, create_collection


class RAGPipeline:
    def __init__(self):
        # Load the embedding model and set up the ChromaDB collection once,
        # so they can be reused across indexing and query calls.
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)

    def index_document(self, pdf_path):
        # Load the PDF, split it into chunks, embed each chunk, and store everything in ChromaDB.
        print(f"Loading: {pdf_path}")
        docs = load_pdf(pdf_path)

        print("Chunking...")
        chunks = chunk_documents(docs)

        print("Creating embeddings...")
        texts = [chunk.page_content for chunk in chunks]
        vectors = embed_text(texts, self.tokenizer, self.model)

        print("Adding to ChromaDB...")
        ids = [f"doc_{i}" for i in range(len(texts))]
        self.collection.add(documents=texts, embeddings=vectors, ids=ids)
        print(f"Indexed {len(texts)} chunks.")

    def query(self, question):
        # Embed the question and retrieve the three most similar chunks.
        print(f"Question: {question}")
        question_vec = embed_text([question], self.tokenizer, self.model)[0]
        results = self.collection.query(
            query_embeddings=[question_vec],
            n_results=3
        )

        print("\nTop Documents:")
        for i, doc in enumerate(results["documents"][0]):
            print(f"{i+1}. {doc[:200]}...\n")

        # Return the single best-matching chunk.
        return results["documents"][0][0]
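

# Minimal usage sketch: build the pipeline, index one PDF, then ask a question.
# The file path and question below are placeholders for illustration, not part of the project.
if __name__ == "__main__":
    pipeline = RAGPipeline()
    pipeline.index_document("example.pdf")  # hypothetical path; replace with a real PDF
    answer = pipeline.query("What is this document about?")
    print(answer)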