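"""Minimal RAG pipeline: load a PDF, chunk it, embed the chunks, and retrieve them from ChromaDB.

The imported helpers (pdf_loader, optimal_chunker, embedder_light, vector_store) are
expected to provide PDF loading, chunking, the embedding model, and the ChromaDB
client/collection setup.
"""
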
from pdf_loader import load_pdf
from optimal_chunker import chunk_documents
from embedder_light import get_embedder, embed_text
from vector_store import get_chroma_client, create_collection

class RAGPipeline:
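    """Index PDF documents into a ChromaDB collection and answer queries
    via similarity search over the stored chunk embeddings."""
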
    def __init__(self):
        self.tokenizer, self.model = get_embedder()
        self.db_client = get_chroma_client()
        self.collection = create_collection(self.db_client)

    def index_document(self, pdf_path):
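        """Load a PDF, split it into chunks, embed the chunks, and store them in the collection."""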
        print(f"📄 Loading: {pdf_path}")
        docs = load_pdf(pdf_path)

        print("✂️ Chunking...")
        chunks = chunk_documents(docs)

        print("🔢 Creating embeddings...")
        texts = [chunk.page_content for chunk in chunks]
        vectors = embed_text(texts, self.tokenizer, self.model)

        print("🧠 Adding to ChromaDB...")
        ids = [f"doc_{i}" for i in range(len(texts))]
        self.collection.add(documents=texts, embeddings=vectors, ids=ids)

        print(f"✅ Indexed {len(texts)} chunks.")

    def query(self, question):
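        """Embed the question, retrieve the 3 most similar chunks, print previews, and return them as one string."""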
        print(f"❓ Question: {question}")
        question_vec = embed_text([question], self.tokenizer, self.model)[0]

        results = self.collection.query(
            query_embeddings=[question_vec],
            n_results=3
        )

        print("\n🔍 Top Documents:")
        for i, doc in enumerate(results["documents"][0]):
            print(f"{i+1}. {doc[:200]}...\n")

        # Return the top chunks as a single string so HF Spaces can display the answer
        return "\n\n".join([f"{i+1}. {doc[:500]}" for i, doc in enumerate(results["documents"][0])])

if __name__ == "__main__":
    rag = RAGPipeline()
    rag.index_document("sample.pdf")
    rag.query("What is this document about?")