Upload 5 files

- README.md +22 -14
- app.py +47 -0
- rag_pipeline.py +53 -0
- requirements.txt +10 -0
- utils.py +32 -0
README.md
CHANGED
@@ -1,14 +1,22 @@
# 🤖 Arabic RAG Assistant - Pope Shenouda III Faith Books

This Hugging Face Space reads Arabic PDF and DOCX documents, indexes their content with FAISS, and answers Arabic questions with cited source passages.

## Features

- Supports multiple file uploads (PDF/DOCX)
- Parses and chunks Arabic text
- Retrieves relevant passages for question answering
- Generates answers with a multilingual open-source LLM
- Exports the answer and its citations as a Word file

## Instructions

1. Upload Arabic books (PDF or DOCX)
2. Ask your question in Arabic
3. Get an answer with cited passages
4. Download the answer as a Word document
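## How it works

Indexing and retrieval follow the standard embed-and-search pattern (see `rag_pipeline.py` below). A minimal sketch with hypothetical sample chunks, assuming the same embedding model and FAISS index type the pipeline uses:

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Same multilingual embedding model the pipeline loads (384-dim vectors).
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

chunks = ["الإيمان هو الثقة بما يرجى", "المحبة تحتمل كل شيء"]  # hypothetical chunks
embeddings = model.encode(chunks)  # float32 ndarray, shape (2, 384)

index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search
index.add(np.asarray(embeddings))

query = model.encode(["ما هو الإيمان؟"])  # "What is faith?"
distances, ids = index.search(np.asarray(query), 2)
print([chunks[i] for i in ids[0]])  # nearest chunks first
```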
## License

Open-source for educational use.
app.py
ADDED
@@ -0,0 +1,47 @@
import gradio as gr
from utils import extract_texts_from_files, clean_arabic, chunk_text
from rag_pipeline import ArabicRAGPipeline, save_to_doc

rag = ArabicRAGPipeline()

def process_files(files):
    # Extract, clean, and chunk each uploaded file, then rebuild the FAISS index.
    all_chunks = []
    for file in files:
        text = extract_texts_from_files(file)
        if not text:
            continue
        clean_text = clean_arabic(text)
        chunks = chunk_text(clean_text, source=file.name)
        all_chunks.extend(chunks)
    rag.build_index(all_chunks)
    # One return value: the change handler below has a single output component.
    return "✅ تم تحميل وفهرسة الملفات بنجاح"  # "Files uploaded and indexed successfully"

def ask_question(question):
    passages = rag.retrieve(question)
    answer, cited_passages = rag.generate_answer(question, passages)
    citations = "\n\n".join(f"📌 {src}" for _, src in cited_passages)
    return answer, citations

def export_answer(answer, citations):
    return save_to_doc(answer, citations)

with gr.Blocks(theme=gr.themes.Base(), css="body { background-color: #111; color: #eee; font-family: 'Cairo', sans-serif; }") as demo:
    gr.Image("assets/logo.png", height=120)
    gr.Markdown("### 🤖 مساعد الإيمان - روبوت ذكي لتحليل كتب البابا شنودة الثالث")  # "Faith Assistant - an AI bot for analyzing the books of Pope Shenouda III"

    with gr.Row():
        file_input = gr.File(label="📚 تحميل ملفات PDF أو DOCX", file_types=[".pdf", ".docx"], file_count="multiple")  # "Upload PDF or DOCX files"
        file_status = gr.Textbox(label="📌 الحالة", interactive=False)  # "Status"

    file_input.change(fn=process_files, inputs=file_input, outputs=file_status)

    question_input = gr.Textbox(label="✍️ اكتب سؤالك هنا", placeholder="مثال: ما هو دور الإيمان في المعجزات؟")  # "Write your question here" / "Example: What is the role of faith in miracles?"
    answer_output = gr.Textbox(label="🧠 الإجابة", lines=5)  # "Answer"
    citations_output = gr.Textbox(label="🔖 المراجع المستخدمة", lines=10)  # "References used"
    export_btn = gr.Button("💾 حفظ الإجابة كمستند")  # "Save the answer as a document"
    output_file = gr.File(label="📥 تحميل الملف")  # "Download the file"

    question_input.submit(fn=ask_question, inputs=question_input, outputs=[answer_output, citations_output])
    export_btn.click(fn=export_answer, inputs=[answer_output, citations_output], outputs=output_file)

demo.launch()
rag_pipeline.py
ADDED
@@ -0,0 +1,53 @@
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
from docx import Document

class ArabicRAGPipeline:
    def __init__(self):
        # Multilingual sentence embeddings for Arabic retrieval.
        self.embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        self.retriever_index = None
        self.text_chunks = []
        self.chunk_embeddings = None
        self.generator = pipeline(
            "text-generation",
            model="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",      # full Hub repo id
            tokenizer="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
            max_new_tokens=256
        )

    def build_index(self, chunks):
        # `chunks` is a list of (text, source) tuples from utils.chunk_text.
        self.text_chunks = chunks
        texts = [chunk[0] for chunk in chunks]
        self.chunk_embeddings = self.embedding_model.encode(texts, convert_to_tensor=False)
        dim = self.chunk_embeddings[0].shape[0]
        self.retriever_index = faiss.IndexFlatL2(dim)
        self.retriever_index.add(np.array(self.chunk_embeddings))

    def retrieve(self, query, top_k=3):
        if self.retriever_index is None:
            return []  # nothing indexed yet
        query_vec = self.embedding_model.encode([query])[0]
        distances, indices = self.retriever_index.search(np.array([query_vec]), top_k)
        return [self.text_chunks[i] for i in indices[0]]

    def generate_answer(self, query, retrieved_passages):
        context = "\n\n".join(p for p, _ in retrieved_passages)
        # Prompt (Arabic): "Answer the following question in Modern Standard Arabic,
        # relying only on the texts below. Give an answer supported by the original
        # text, and cite the reference used."
        prompt = f"""أجب باللغة العربية الفصحى على السؤال التالي، بالاعتماد فقط على النصوص التالية. قدم إجابة مدعومة من النص الأصلي، واذكر المرجع المستخدم:

النصوص:
{context}

السؤال: {query}
الإجابة:"""
        # The pipeline echoes the prompt, so keep only what follows "الإجابة:" ("Answer:").
        response = self.generator(prompt)[0]['generated_text']
        return response.split("الإجابة:")[-1].strip(), retrieved_passages

def save_to_doc(answer, citations):
    doc = Document()
    doc.add_heading("الإجابة", level=1)  # "Answer"
    doc.add_paragraph(answer)
    doc.add_heading("المصادر", level=2)  # "Sources"
    doc.add_paragraph(citations)
    path = "/tmp/faith_answer.docx"
    doc.save(path)
    return path
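The pipeline is also usable without the Gradio front end. A minimal sketch, assuming two hypothetical (text, source) chunks; note that constructing ArabicRAGPipeline downloads both the embedding model and the large generator:

```python
from rag_pipeline import ArabicRAGPipeline, save_to_doc

rag = ArabicRAGPipeline()  # loads the embedding model and the generator

# Hypothetical (text, source) tuples in the shape utils.chunk_text returns.
chunks = [
    ("الإيمان هو الثقة بما يرجى والإيقان بأمور لا ترى.", "حياة الإيمان.pdf"),
    ("المحبة تحتمل كل شيء وتصدق كل شيء.", "الحب الأخوي.docx"),
]
rag.build_index(chunks)

question = "ما هو الإيمان؟"  # "What is faith?"
passages = rag.retrieve(question, top_k=1)
answer, cited = rag.generate_answer(question, passages)

citations = "\n\n".join(f"📌 {src}" for _, src in cited)  # mirrors app.py's formatting
print(save_to_doc(answer, citations))  # -> /tmp/faith_answer.docx
```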
requirements.txt
ADDED
@@ -0,0 +1,10 @@
gradio
transformers
sentence-transformers
faiss-cpu
PyMuPDF
python-docx
llama-cpp-python
arabic_reshaper
python-bidi
scikit-learn
utils.py
ADDED
@@ -0,0 +1,32 @@
import fitz  # PyMuPDF
import docx
import re

def extract_texts_from_files(file):
    # Gradio may pass a tempfile wrapper or a plain path string; normalize to a path.
    path = getattr(file, "name", file)
    try:
        if path.endswith(".pdf"):
            doc = fitz.open(path)
            return "\n".join(page.get_text() for page in doc)
        elif path.endswith(".docx"):
            d = docx.Document(path)
            return "\n".join(p.text for p in d.paragraphs)
    except Exception:
        pass
    return ""

def clean_arabic(text):
    # Keep Arabic characters, whitespace, and sentence-ending punctuation;
    # dropping '.' and '!' here would break the sentence splitter in chunk_text.
    text = re.sub(r'[^\u0600-\u06FF\s.!?]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def chunk_text(text, source="مصدر غير معروف", max_words=150):  # default source: "unknown source"
    # Split on sentence boundaries, then pack sentences into chunks of roughly max_words.
    sentences = re.split(r'(?<=[.!؟])\s+', text)
    chunks = []
    current = []
    for sentence in sentences:
        current.append(sentence)
        if len(" ".join(current).split()) > max_words:
            chunks.append((" ".join(current), source))
            current = []
    if current:
        chunks.append((" ".join(current), source))
    return chunks
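A quick sanity check of the chunking contract, using a hypothetical two-sentence snippet; each chunk comes back as a (text, source) tuple, the shape ArabicRAGPipeline.build_index expects:

```python
from utils import clean_arabic, chunk_text

raw = "الإيمان هو الثقة بما يرجى. المحبة تحتمل كل شيء!"  # hypothetical sample
for text, source in chunk_text(clean_arabic(raw), source="sample.docx", max_words=5):
    print(source, "→", text)  # both sentences packed into one chunk under "sample.docx"
```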