Spaces:
Sleeping
Sleeping
from pathlib import Path | |
from langchain_community.document_loaders import PyMuPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.vectorstores import Chroma | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from transformers import pipeline | |
# Retriever that returns the top-5 most relevant document chunks
def init_retriever():
    """Index the PDFs under ``data/`` and return a top-5 retriever over them.

    The ``data`` directory is created if it does not exist. Every ``*.pdf``
    directly inside it is loaded with ``PyMuPDFLoader``, split with
    ``chunk_size=1000`` / ``chunk_overlap=200``, embedded with the
    ``sentence-transformers/LaBSE`` model on CPU, and stored in a Chroma
    collection persisted under ``chroma_db``.

    When no PDF yields any chunk (for example on a fresh checkout where
    ``data/`` is empty), the persisted store is opened as-is instead of
    calling ``Chroma.from_documents`` with an empty list.

    Returns:
        A retriever configured with ``search_kwargs={"k": 5}``.
    """
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    docs = []
    for pdf in data_dir.glob("*.pdf"):
        docs.extend(PyMuPDFLoader(str(pdf)).load())

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )

    if chunks:
        # NOTE(review): no explicit ids are passed, so each start-up presumably
        # appends the same chunks again to the persisted collection; confirm
        # whether "chroma_db" survives restarts and de-duplicate if it does.
        vectordb = Chroma.from_documents(
            chunks, embeddings, persist_directory="chroma_db"
        )
    else:
        # Nothing to index: Chroma is expected to reject an empty batch, so
        # open the (possibly empty) persisted store instead of crashing at
        # import time.
        vectordb = Chroma(
            persist_directory="chroma_db", embedding_function=embeddings
        )

    return vectordb.as_retriever(search_kwargs={"k": 5})
# Build the retriever once at import time so every question reuses it.
retriever = init_retriever()

# Extractive Arabic question-answering pipeline. The same checkpoint supplies
# both the model weights and the tokenizer; device=-1 is the transformers
# convention for running on CPU.
_QA_CHECKPOINT = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA"
qa_pipeline = pipeline(
    task="question-answering",
    model=_QA_CHECKPOINT,
    tokenizer=_QA_CHECKPOINT,
    device=-1,
)
def answer(question: str) -> str:
    """Answer ``question`` extractively from the indexed PDF chunks.

    The module-level ``retriever`` fetches the relevant chunks, their text is
    joined with blank lines into one context string, and ``qa_pipeline``
    extracts an answer span from that context.

    Args:
        question: The user's question.

    Returns:
        The extracted answer text, or a fixed Arabic apology message when the
        question is blank, no context could be retrieved, or the pipeline
        produced no (or an empty) answer.
    """
    fallback = "عفواً، لم أجد إجابة واضحة."

    # The question-answering pipeline rejects empty inputs with a ValueError,
    # so short-circuit to the fallback instead of surfacing an exception.
    if not question or not question.strip():
        return fallback

    docs = retriever.get_relevant_documents(question)
    context = "\n\n".join(d.page_content for d in docs)
    if not context.strip():
        return fallback

    out = qa_pipeline(question=question, context=context)
    # `or` (rather than a .get default) also covers an empty-string answer.
    return out.get("answer") or fallback