"""Arabic RAG question answering over local PDFs.

Indexes every PDF under ./data into a persistent Chroma vector store
(LaBSE multilingual embeddings) and answers questions extractively with
an AraElectra SQuADv2 model.
"""
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline
# Retriever for top-5 relevant document chunks
def init_retriever():
    """Build a top-5 Chroma retriever over all PDFs in ./data.

    Loads each PDF with PyMuPDF, splits it into 1000-char chunks with
    200-char overlap, embeds the chunks with LaBSE (multilingual, so it
    covers Arabic) on CPU, and persists the vector store to ./chroma_db.

    Returns:
        A retriever yielding the 5 most relevant chunks per query.

    Raises:
        ValueError: if no PDF documents are found in ./data.
    """
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    docs = []
    for pdf_path in data_dir.glob("*.pdf"):
        docs.extend(PyMuPDFLoader(str(pdf_path)).load())

    # Chroma.from_documents raises an opaque error on an empty corpus;
    # fail early with an actionable message instead.
    if not docs:
        raise ValueError("No PDF files found in ./data; add documents first.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )
    vectordb = Chroma.from_documents(chunks, embeddings, persist_directory="chroma_db")
    return vectordb.as_retriever(search_kwargs={"k": 5})
# Built once at import time: indexes ./data and opens the vector store.
retriever = init_retriever()

# Arabic QA pipeline (extractive): answers are spans copied from the
# retrieved context, not generated text. SQuADv2-trained, so the model
# can also signal "no answer". device=-1 forces CPU inference.
qa_pipeline = pipeline(
    "question-answering",
    model="ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA",
    tokenizer="ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA",
    device=-1
)
def answer(question: str) -> str:
    """Answer an Arabic question extractively from the indexed PDFs.

    Retrieves the top-matching chunks, concatenates them into one
    context, and extracts an answer span with the QA pipeline.

    Args:
        question: The user's question (Arabic expected).

    Returns:
        The extracted answer, or a polite Arabic fallback message when
        nothing relevant was retrieved or the model found no answer.
    """
    fallback = "عفواً، لم أجد إجابة واضحة."
    # .invoke replaces the deprecated get_relevant_documents API.
    docs = retriever.invoke(question)
    if not docs:
        return fallback
    context = "\n\n".join(d.page_content for d in docs)
    out = qa_pipeline(question=question, context=context)
    # SQuADv2 models may return an empty span for "no answer" — the old
    # key-missing check alone let empty strings through.
    result = (out.get("answer") or "").strip()
    return result or fallback
|