RAG47V3 / rag_pipeline.py
ramysaidagieb's picture
Update rag_pipeline.py
07f5718 verified
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline
# Retriever for top-5 relevant document chunks
def init_retriever():
    """Build a Chroma-backed retriever over every PDF found in ``data/``.

    The ``data`` directory is created if missing. Each ``*.pdf`` in it is
    loaded with ``PyMuPDFLoader``, split into 1000-character chunks with a
    200-character overlap, embedded on CPU with
    ``sentence-transformers/LaBSE`` and stored in the Chroma collection
    persisted under ``chroma_db``.

    Returns:
        A retriever that yields the 5 most similar chunks for a query.
    """
    import hashlib  # local import: only needed to derive the chunk ids below

    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    docs = []
    # Sort so the load order does not depend on filesystem enumeration order.
    for pdf in sorted(data_dir.glob("*.pdf")):
        docs.extend(PyMuPDFLoader(str(pdf)).load())

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )

    if not chunks:
        # No PDFs (or no extractable text): inserting an empty batch is
        # expected to fail, so just open whatever is already persisted.
        vectordb = Chroma(
            persist_directory="chroma_db",
            embedding_function=embeddings,
        )
    else:
        # Derive a stable id from (source, page, text) for every chunk.
        # NOTE(review): relies on LangChain's Chroma wrapper upserting by id,
        # so restarting the app overwrites the same rows instead of appending
        # a fresh copy of the corpus under random ids - confirm for the
        # pinned version.
        # The dict also drops exact duplicates, which would otherwise collide
        # on the same id within one batch.
        unique_chunks = {}
        for chunk in chunks:
            key = "\x1f".join(
                (
                    str(chunk.metadata.get("source", "")),
                    str(chunk.metadata.get("page", "")),
                    chunk.page_content,
                )
            )
            chunk_id = hashlib.sha256(key.encode("utf-8")).hexdigest()
            unique_chunks.setdefault(chunk_id, chunk)

        vectordb = Chroma.from_documents(
            list(unique_chunks.values()),
            embeddings,
            ids=list(unique_chunks),
            persist_directory="chroma_db",
        )

    return vectordb.as_retriever(search_kwargs={"k": 5})
# Build the shared retriever once, when the module is imported.
retriever = init_retriever()

# Extractive Arabic question-answering pipeline. The same checkpoint provides
# both the model weights and the tokenizer.
_QA_CHECKPOINT = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA"
qa_pipeline = pipeline(
    task="question-answering",
    model=_QA_CHECKPOINT,
    tokenizer=_QA_CHECKPOINT,
    device=-1,  # -1 is the Transformers convention for running on CPU
)
def answer(question: str) -> str:
    """Answer *question* from the indexed documents.

    The retrieved chunks are joined into a single context and passed to the
    extractive QA pipeline together with the question.

    Args:
        question: The user's question.

    Returns:
        The answer span extracted by the QA pipeline, or a fixed Arabic
        "no clear answer found" message when the question is blank, nothing
        was retrieved, or the model produced an empty answer.
    """
    fallback = "عفواً، لم أجد إجابة واضحة."

    # The QA pipeline rejects an empty question, so answer it ourselves.
    if not question or not question.strip():
        return fallback

    # `invoke` replaces the deprecated `get_relevant_documents`.
    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)

    # An empty index (or no hits) leaves no context; the pipeline would raise.
    if not context.strip():
        return fallback

    out = qa_pipeline(question=question, context=context)

    # The "answer" key is always present but may be an empty string when the
    # model finds no answer, so test the value rather than the key.
    text = out.get("answer") or ""
    return text if text.strip() else fallback