from pathlib import Path

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline


def init_retriever():
    """Build a Chroma retriever over all PDFs in ./data and return the top-5 chunks per query."""
    Path("data").mkdir(exist_ok=True)

    # Load every PDF in the data directory into LangChain documents.
    docs = []
    for pdf in Path("data").glob("*.pdf"):
        loader = PyMuPDFLoader(str(pdf))
        docs.extend(loader.load())

    # Split into overlapping chunks so answers spanning chunk boundaries stay retrievable.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    # Multilingual sentence embeddings (LaBSE) on CPU, persisted to a local Chroma store.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )
    vectordb = Chroma.from_documents(chunks, embeddings, persist_directory="chroma_db")
    return vectordb.as_retriever(search_kwargs={"k": 5})


retriever = init_retriever()

# Arabic extractive QA pipeline (AraElectra fine-tuned on Arabic SQuADv2), CPU only.
qa_pipeline = pipeline(
    "question-answering",
    model="ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA",
    tokenizer="ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA",
    device=-1,
)


def answer(question: str) -> str:
    """Retrieve relevant chunks and extract an answer span from them."""
    docs = retriever.get_relevant_documents(question)
    context = "\n\n".join(d.page_content for d in docs)
    out = qa_pipeline(question=question, context=context)
    # SQuADv2-style models can return an empty span when no answer is present,
    # so fall back to an apology ("Sorry, I couldn't find a clear answer.").
    extracted = (out.get("answer") or "").strip()
    return extracted if extracted else "عفواً، لم أجد إجابة واضحة."
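

# A minimal usage sketch, assuming the PDFs are already placed under ./data before
# this module is imported; the sample question below is only an illustrative placeholder.
if __name__ == "__main__":
    sample_question = "ما هي عاصمة مصر؟"  # "What is the capital of Egypt?" (hypothetical example)
    print(answer(sample_question))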