from pdf_loader import load_pdf
from optimal_chunker import chunk_documents
from embedder_light import get_embedder, embed_text
from vector_store import get_chroma_client, create_collection


class RAGPipeline:
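    """End-to-end RAG pipeline: index a PDF into ChromaDB, then answer questions by vector similarity."""
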
    def __init__(self):
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)

    def index_document(self, pdf_path):
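        """Load a PDF, chunk it, embed the chunks, and store them in the ChromaDB collection."""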
print(f"📄 Loading: {pdf_path}") | |
docs = load_pdf(pdf_path) | |
print("✂️ Chunking...") | |
chunks = chunk_documents(docs) | |
print("🔢 Creating embeddings...") | |
texts = [chunk.page_content for chunk in chunks] | |
vectors = embed_text(texts, self.tokenizer, self.model) | |
print("🧠 Adding to ChromaDB...") | |
ids = [f"doc_{i}" for i in range(len(texts))] | |
self.collection.add(documents=texts, embeddings=vectors, ids=ids) | |
print(f"✅ Indexed {len(texts)} chunks.") | |
    def query(self, question):
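        """Embed the question, retrieve the 3 most similar chunks, print them, and return them as one string."""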
print(f"❓ Question: {question}") | |
question_vec = embed_text([question], self.tokenizer, self.model)[0] | |
results = self.collection.query( | |
query_embeddings=[question_vec], | |
n_results=3 | |
) | |
print("\n🔍 Top Documents:") | |
for i, doc in enumerate(results["documents"][0]): | |
print(f"{i+1}. {doc[:200]}...\n") | |
        # Return the retrieved chunks as the answer shown in the HF Spaces output
        return "\n\n".join([f"{i+1}. {doc[:500]}" for i, doc in enumerate(results["documents"][0])])


if __name__ == "__main__":
    rag = RAGPipeline()
    rag.index_document("sample.pdf")
    rag.query("What is this document about?")
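
# Minimal sketch of how the string returned by query() could be surfaced in the
# Space's UI with Gradio (assumptions: the Space serves a Gradio app and gradio
# is installed; the component choices below are illustrative, not from this file):
#
#     import gradio as gr
#
#     pipeline = RAGPipeline()
#     pipeline.index_document("sample.pdf")
#
#     demo = gr.Interface(fn=pipeline.query, inputs="text", outputs="text")
#     demo.launch()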