ramysaidagieb committed
Commit 3dc1a7f · verified · 1 Parent(s): e910126

Upload 3 files

Files changed (3)
  1. app.py +16 -0
  2. rag_pipeline.py +39 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,16 @@
+import gradio as gr
+from rag_pipeline import answer_question
+
+def qa_interface(question):
+    return answer_question(question)
+
+iface = gr.Interface(
+    fn=qa_interface,
+    inputs=gr.Textbox(lines=3, placeholder="اكتب سؤالك هنا...", label="سؤالك", rtl=True),
+    outputs=gr.Textbox(label="الإجابة", rtl=True),
+    title="🤖 روبوت سؤال وجواب عربي باستخدام PDF",
+    description="ارفع ملفات PDF باللغة العربية، واسأل عنها مباشرة (يدعم اللهجة المصرية ويعمل على CPU)."
+)
+
+if __name__ == "__main__":
+    iface.launch()
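
A note on app.py: the interface wires a single RTL text box to answer_question from rag_pipeline.py; the rtl keyword on gr.Textbox only exists in relatively recent Gradio releases, so the unpinned gradio entry in requirements.txt needs to resolve to a new enough version. A minimal sketch for exercising the pipeline without launching the UI (the file name and sample question are hypothetical, and it assumes at least one Arabic PDF is already in data/; importing rag_pipeline builds the index and loads the QA model, so the first call is slow):

# smoke_test.py -- hypothetical helper, not part of this commit.
# Assumes rag_pipeline.py above is importable and data/ holds PDFs.
from rag_pipeline import answer_question

if __name__ == "__main__":
    sample = "ما هو موضوع هذا المستند؟"  # "What is this document about?"
    print(answer_question(sample))
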
rag_pipeline.py ADDED
@@ -0,0 +1,39 @@
+from pathlib import Path
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
+
+# Load and index Arabic documents
+def load_and_index():
+    pdf_dir = Path("data")
+    pdf_dir.mkdir(exist_ok=True)
+    docs = []
+    for pdf_file in pdf_dir.glob("*.pdf"):
+        loader = PyMuPDFLoader(str(pdf_file))
+        docs.extend(loader.load())
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    split_docs = splitter.split_documents(docs)
+
+    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/LaBSE", model_kwargs={"device": "cpu"})
+    db = Chroma.from_documents(split_docs, embedding, persist_directory="chroma_db")
+    return db.as_retriever(search_kwargs={"k": 5})
+
+# Load Arabic QA model
+qa_pipeline = pipeline(
+    "question-answering",
+    model=AutoModelForQuestionAnswering.from_pretrained("alyaa82/aravec-bert-base-qa"),
+    tokenizer=AutoTokenizer.from_pretrained("alyaa82/aravec-bert-base-qa"),
+    device=-1
+)
+
+# Get retriever once
+retriever = load_and_index()
+
+# Perform retrieval + QA
+def answer_question(question: str) -> str:
+    docs = retriever.get_relevant_documents(question)
+    context = "\n\n".join(doc.page_content for doc in docs)
+    result = qa_pipeline(question=question, context=context)
+    return result['answer']
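
Two observations on rag_pipeline.py. First, newer LangChain releases deprecate retriever.get_relevant_documents() in favour of retriever.invoke(); the call above still works but emits a deprecation warning. Second, load_and_index() re-parses and re-embeds every PDF on each process start even though the Chroma collection is persisted to chroma_db. Below is a hedged sketch of an alternative load_and_index() that reuses the persisted index when present; it assumes the same embedding model and persist directory as above and is a sketch, not a tested drop-in.

# Sketch only: reopen the persisted Chroma collection when it already
# exists, instead of re-parsing and re-embedding the PDFs on every start-up.
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

def load_and_index():
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )
    if Path("chroma_db").exists():
        # Reopen the existing collection; skips PDF parsing and embedding.
        db = Chroma(persist_directory="chroma_db", embedding_function=embedding)
        return db.as_retriever(search_kwargs={"k": 5})

    # First run: build the index from the PDFs in data/ and persist it.
    pdf_dir = Path("data")
    pdf_dir.mkdir(exist_ok=True)
    docs = []
    for pdf_file in pdf_dir.glob("*.pdf"):
        docs.extend(PyMuPDFLoader(str(pdf_file)).load())
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)
    db = Chroma.from_documents(split_docs, embedding, persist_directory="chroma_db")
    return db.as_retriever(search_kwargs={"k": 5})
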
requirements.txt ADDED
@@ -0,0 +1,8 @@
+gradio
+transformers
+sentence-transformers
+langchain
+langchain-community
+langchain-huggingface
+chromadb
+pymupdf
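
All eight dependencies are left unpinned, so the Space builds against whatever the latest releases are at that moment. A quick, hypothetical sanity check that every package imports under its runtime module name (PyMuPDF, notably, imports as fitz):

# check_deps.py -- hypothetical helper, not part of this commit.
import importlib

# requirements.txt entry -> module name used at import time
IMPORT_NAMES = {
    "gradio": "gradio",
    "transformers": "transformers",
    "sentence-transformers": "sentence_transformers",
    "langchain": "langchain",
    "langchain-community": "langchain_community",
    "langchain-huggingface": "langchain_huggingface",
    "chromadb": "chromadb",
    "pymupdf": "fitz",
}

for requirement, module_name in IMPORT_NAMES.items():
    importlib.import_module(module_name)
    print(f"ok: {requirement} (imports as {module_name})")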