ramysaidagieb committed on
Commit
99354e0
·
verified ·
1 Parent(s): 12fd1c0

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +22 -14
  2. app.py +47 -0
  3. rag_pipeline.py +53 -0
  4. requirements.txt +10 -0
  5. utils.py +32 -0
README.md CHANGED
@@ -1,14 +1,22 @@
1
- ---
2
- title: RagGV1
3
- emoji: 📊
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.30.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: think
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
1
+ # 🤖 Arabic RAG Assistant - Pope Shenouda III Faith Books
2
+
3
+ This Hugging Face Space reads Arabic PDF and DOCX documents, indexes their content using FAISS, and answers Arabic questions with cited source passages.
4
+
5
+ ## Features
6
+
7
+ - Supports multiple file uploads (PDF/DOCX)
8
+ - Parses and chunks Arabic text
9
+ - Retrieves relevant text for question answering
10
+ - Generates answers using a multilingual open-source LLM
11
+ - Exports answers + citations as a Word file
12
+
13
+ ## Instructions
14
+
15
+ 1. Upload Arabic books (PDF or DOCX)
16
+ 2. Ask your question in Arabic
17
+ 3. Get an answer with cited passages
18
+ 4. Download the answer as a Word document
19
+
20
+ ## License
21
+
22
+ Open-source for educational use.
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from utils import extract_texts_from_files, clean_arabic, chunk_text
3
+ from rag_pipeline import ArabicRAGPipeline, save_to_doc
4
+
5
# Single pipeline instance created at import time and shared by the
# handler functions defined in this module.
rag = ArabicRAGPipeline()
6
+
7
def process_files(files):
    """Extract, clean, chunk and index every uploaded document.

    Args:
        files: Iterable of uploaded file objects, each exposing a ``name``
            attribute. May be ``None`` or empty when the upload widget is
            cleared.

    Returns:
        str: A status message (in Arabic) for the status textbox. The
        change event is wired to a single output component, so exactly one
        value is returned.
    """
    # The change event also fires when the user clears the widget.
    if not files:
        return "⚠️ لم يتم تحميل أي ملفات"

    all_chunks = []
    for file in files:
        text = extract_texts_from_files(file)
        if not text:
            continue
        clean_text = clean_arabic(text)
        all_chunks.extend(chunk_text(clean_text, source=file.name))

    # Cleaning can reduce a document to nothing (e.g. non-Arabic files);
    # blank chunks would only pollute the index.
    all_chunks = [chunk for chunk in all_chunks if chunk[0].strip()]
    if not all_chunks:
        return "⚠️ تعذر استخراج نص عربي من الملفات المرفوعة"

    rag.build_index(all_chunks)
    return "✅ تم تحميل وفهرسة الملفات بنجاح"
18
+
19
def ask_question(question):
    """Answer a question using the currently indexed documents.

    Args:
        question: The user's question text. May be ``None`` or blank.

    Returns:
        tuple[str, str]: ``(answer, citations)``. ``citations`` lists the
        source of each retrieved passage, one per paragraph. When the
        question is blank or no documents have been indexed yet, the first
        element is a user-facing hint and the second is empty.
    """
    if not question or not question.strip():
        return "⚠️ الرجاء كتابة سؤال أولاً", ""
    # Without this guard a question asked before any upload would fail
    # inside the retriever with an AttributeError on None.
    if rag.retriever_index is None:
        return "⚠️ الرجاء تحميل الملفات وفهرستها أولاً", ""

    passages = rag.retrieve(question)
    answer, cited_passages = rag.generate_answer(question, passages)
    citations = "\n\n".join(f"📌 {src}" for _, src in cited_passages)
    return answer, citations
24
+
25
def export_answer(answer, citations):
    """Hand the answer and citations to ``save_to_doc`` and return its result."""
    exported = save_to_doc(answer, citations)
    return exported
27
+
28
# Build the Gradio interface: an upload row, a question box, answer and
# citation boxes, and an export button with a downloadable file output.
with gr.Blocks(theme=gr.themes.Base(), css="body { background-color: #111; color: #eee; font-family: 'Cairo', sans-serif; }") as demo:
    # NOTE(review): the logo is loaded from a relative path; confirm that
    # assets/logo.png is shipped alongside the app.
    gr.Image("assets/logo.png", height=120)
    gr.Markdown("### 🤖 مساعد الإيمان - روبوت ذكي لتحليل كتب البابا شنودة الثالث")

    with gr.Row():
        file_input = gr.File(label="📚 تحميل ملفات PDF أو DOCX", file_types=[".pdf", ".docx"], file_count="multiple")
        file_status = gr.Textbox(label="📌 الحالة", interactive=False)

    # Run process_files whenever the uploaded file set changes; its result
    # is shown in the status textbox.
    file_input.change(fn=process_files, inputs=file_input, outputs=file_status)

    question_input = gr.Textbox(label="✍️ اكتب سؤالك هنا", placeholder="مثال: ما هو دور الإيمان في المعجزات؟")
    answer_output = gr.Textbox(label="🧠 الإجابة", lines=5)
    citations_output = gr.Textbox(label="🔖 المراجع المستخدمة", lines=10)
    export_btn = gr.Button("💾 حفظ الإجابة كمستند")
    output_file = gr.File(label="📥 تحميل الملف")

    # Submitting the question box fills the answer and citations boxes.
    question_input.submit(fn=ask_question, inputs=question_input, outputs=[answer_output, citations_output])
    # The export button passes the displayed answer and citations to
    # export_answer and offers the returned file for download.
    export_btn.click(fn=export_answer, inputs=[answer_output, citations_output], outputs=output_file)

demo.launch()
rag_pipeline.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ import faiss
3
+ import numpy as np
4
+ from transformers import pipeline
5
+ from docx import Document
6
+
7
class ArabicRAGPipeline:
    """Retrieval-augmented question answering over ``(text, source)`` chunks.

    Chunks are embedded with a multilingual sentence-transformer, stored in
    a flat L2 FAISS index, and the nearest chunks are passed as context to
    a text-generation model.
    """

    def __init__(self):
        """Load the embedding and generation models and start with no index."""
        self.embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        self.retriever_index = None
        self.text_chunks = []
        self.chunk_embeddings = None
        # NOTE(review): confirm this model id resolves on the Hugging Face
        # Hub and fits the deployment hardware.
        self.generator = pipeline(
            "text-generation",
            model="NousResearch/Nous-Hermes-2-Mistral",
            tokenizer="NousResearch/Nous-Hermes-2-Mistral",
            max_new_tokens=256
        )

    def build_index(self, chunks):
        """(Re)build the vector index from ``chunks``.

        Args:
            chunks: Sequence of ``(text, source)`` tuples. An empty sequence
                clears the index instead of raising.
        """
        chunks = list(chunks)
        if not chunks:
            # Nothing to embed: reset to the "no index" state so that
            # retrieve() returns no passages rather than stale ones.
            self.text_chunks = []
            self.chunk_embeddings = None
            self.retriever_index = None
            return

        texts = [chunk[0] for chunk in chunks]
        # FAISS expects a 2-D float32 matrix.
        embeddings = np.asarray(
            self.embedding_model.encode(texts, convert_to_tensor=False),
            dtype="float32",
        )
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

        # Publish the new state only after the index was built successfully.
        self.text_chunks = chunks
        self.chunk_embeddings = embeddings
        self.retriever_index = index

    def retrieve(self, query, top_k=3):
        """Return up to ``top_k`` chunks closest to ``query``.

        Args:
            query: The question text.
            top_k: Maximum number of chunks to return.

        Returns:
            list: ``(text, source)`` tuples ordered as returned by the index;
            empty when no index has been built.
        """
        if self.retriever_index is None or not self.text_chunks:
            return []
        k = min(top_k, len(self.text_chunks))
        if k < 1:
            return []

        query_vec = np.asarray(self.embedding_model.encode([query]), dtype="float32")
        _, indices = self.retriever_index.search(query_vec, k)
        # FAISS pads missing neighbours with -1, which would otherwise
        # silently index the last chunk.
        return [self.text_chunks[i] for i in indices[0] if i >= 0]

    def generate_answer(self, query, retrieved_passages):
        """Generate an Arabic answer grounded in the retrieved passages.

        Args:
            query: The question text.
            retrieved_passages: Sequence of ``(text, source)`` tuples.

        Returns:
            tuple: ``(answer, retrieved_passages)``.
        """
        context = "\n\n".join(p for p, _ in retrieved_passages)
        prompt = (
            "أجب باللغة العربية الفصحى على السؤال التالي، بالاعتماد فقط على النصوص التالية. قدم إجابة مدعومة من النص الأصلي، واذكر المرجع المستخدم:\n"
            "\n"
            "النصوص:\n"
            f"{context}\n"
            "\n"
            f"السؤال: {query}\n"
            "الإجابة:"
        )
        response = self.generator(prompt)[0]['generated_text']
        if response.startswith(prompt):
            # Drop the echoed prompt by length so an answer that repeats
            # the "الإجابة:" marker is not cut short.
            answer = response[len(prompt):]
        else:
            answer = response.split("الإجابة:")[-1]
        return answer.strip(), retrieved_passages
44
+
45
def save_to_doc(answer, citations):
    """Write the answer and its citations to a Word document.

    Args:
        answer: Answer text; ``None`` is written as an empty paragraph.
        citations: Citation text; ``None`` is written as an empty paragraph.

    Returns:
        str: Path of the saved ``faith_answer.docx`` file.
    """
    import os
    import tempfile

    doc = Document()
    doc.add_heading("الإجابة", level=1)
    doc.add_paragraph(answer or "")
    doc.add_heading("المصادر", level=2)
    doc.add_paragraph(citations or "")
    # A fresh directory per export keeps the download name stable while
    # preventing concurrent users from overwriting each other's file, and
    # avoids hard-coding a POSIX-only /tmp path.
    path = os.path.join(tempfile.mkdtemp(prefix="faith_answer_"), "faith_answer.docx")
    doc.save(path)
    return path
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ sentence-transformers
4
+ faiss-cpu
5
+ PyMuPDF
6
+ python-docx
7
+ llama-cpp-python
8
+ arabic_reshaper
9
+ python-bidi
10
+ scikit-learn
utils.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import docx
3
+ import re
4
+
5
def extract_texts_from_files(file):
    """Return the plain text of one uploaded PDF or DOCX file.

    Args:
        file: Either a path string or an object exposing ``name`` (the file
            path) and optionally ``read()``.

    Returns:
        str: The extracted text with pages/paragraphs joined by newlines,
        or ``""`` when the type is unsupported or extraction fails.
    """
    import logging

    path = getattr(file, "name", file)
    if not isinstance(path, str):
        return ""

    # Uploads may arrive as path strings without a read() method; fall back
    # to opening by path in that case instead of failing silently.
    readable = callable(getattr(file, "read", None))
    lowered = path.lower()  # accept ".PDF" / ".DOCX" as well
    try:
        if lowered.endswith(".pdf"):
            if readable:
                doc = fitz.open(stream=file.read(), filetype="pdf")
            else:
                doc = fitz.open(path)
            with doc:  # release the PDF handle once the text is read
                return "\n".join(page.get_text() for page in doc)
        if lowered.endswith(".docx"):
            d = docx.Document(file if readable else path)
            return "\n".join(p.text for p in d.paragraphs)
    except Exception:
        # Best effort: one unreadable file must not abort the whole batch,
        # but the failure should be visible in the logs.
        logging.getLogger(__name__).exception("Failed to extract text from %s", path)
        return ""
    return ""
15
+
16
def clean_arabic(text):
    """Reduce ``text`` to Arabic characters, whitespace and sentence enders.

    Args:
        text: Raw extracted text; ``None`` or empty yields ``""``.

    Returns:
        str: Text containing only characters in U+0600–U+06FF, single
        spaces, and ``.`` / ``!`` that directly follow Arabic text.
    """
    import unicodedata

    if not text:
        return ""
    # Fold Arabic presentation forms and ligatures into base letters so
    # they are kept rather than stripped by the range filter below.
    text = unicodedata.normalize("NFKC", text)
    # Keep '.' and '!' so sentence boundaries survive for later splitting.
    text = re.sub(r'[^\u0600-\u06FF\s.!]', '', text)
    # Drop terminators orphaned by removed non-Arabic text.
    text = re.sub(r'(?<![\u0600-\u06FF])[.!]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
20
+
21
def chunk_text(text, source="مصدر غير معروف", max_words=150):
    """Split ``text`` into chunks of at most ``max_words`` words.

    Sentences (split after ``.``, ``!`` or ``؟``) are packed greedily into a
    chunk; a chunk is closed before it would exceed the limit, and a single
    sentence longer than the limit is cut on word boundaries.

    Args:
        text: Text to split.
        source: Label attached to every chunk.
        max_words: Upper bound on words per chunk; values below 1 are
            treated as 1.

    Returns:
        list[tuple[str, str]]: ``(chunk_text, source)`` pairs; empty when
        ``text`` contains no words.
    """
    limit = max(1, max_words)
    chunks = []
    current = []  # words of the chunk being built

    for sentence in re.split(r'(?<=[.!؟])\s+', text or ""):
        words = sentence.split()
        if not words:
            continue
        # Close the running chunk first if this sentence would overflow it.
        if current and len(current) + len(words) > limit:
            chunks.append((" ".join(current), source))
            current = []
        # A sentence longer than the limit is cut into full-size pieces.
        while len(words) > limit:
            chunks.append((" ".join(words[:limit]), source))
            words = words[limit:]
        current.extend(words)

    if current:
        chunks.append((" ".join(current), source))
    return chunks