Upload 5 files

- README.md +22 -14
- app.py +47 -0
- rag_pipeline.py +53 -0
- requirements.txt +10 -0
- utils.py +32 -0
README.md
CHANGED
@@ -1,14 +1,22 @@
# 🤖 Arabic RAG Assistant - Pope Shenouda III Faith Books

This Hugging Face Space reads Arabic PDF and DOCX documents, indexes their content with FAISS, and answers Arabic questions with cited source passages.

## Features

- Supports multiple file uploads (PDF/DOCX)
- Parses and chunks Arabic text
- Retrieves relevant passages for question answering
- Generates answers with a multilingual open-source LLM
- Exports the answer and its citations as a Word file

## Instructions

1. Upload Arabic books (PDF or DOCX)
2. Ask your question in Arabic
3. Get an answer with cited passages
4. Download the answer as a Word document
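## How it works

Indexing and retrieval follow the standard embed-and-search pattern (see `rag_pipeline.py` below). A minimal sketch with hypothetical sample chunks, assuming the same embedding model and FAISS index type the pipeline uses:

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Same multilingual embedding model the pipeline loads (384-dim vectors).
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

chunks = ["الإيمان هو الثقة بما يرجى", "المحبة تحتمل كل شيء"]  # hypothetical chunks
embeddings = model.encode(chunks)  # float32 ndarray, shape (2, 384)

index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search
index.add(np.asarray(embeddings))

query = model.encode(["ما هو الإيمان؟"])  # "What is faith?"
distances, ids = index.search(np.asarray(query), 2)
print([chunks[i] for i in ids[0]])  # nearest chunks first
```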
## License

Open-source for educational use.
app.py
ADDED
@@ -0,0 +1,47 @@
import gradio as gr
from utils import extract_texts_from_files, clean_arabic, chunk_text
from rag_pipeline import ArabicRAGPipeline, save_to_doc

rag = ArabicRAGPipeline()

def process_files(files):
    # Extract, clean, and chunk each uploaded file, then rebuild the FAISS index.
    all_chunks = []
    for file in files:
        text = extract_texts_from_files(file)
        if not text:
            continue
        clean_text = clean_arabic(text)
        chunks = chunk_text(clean_text, source=file.name)
        all_chunks.extend(chunks)
    rag.build_index(all_chunks)
    # One return value: the change handler below has a single output component.
    return "✅ تم تحميل وفهرسة الملفات بنجاح"  # "Files uploaded and indexed successfully"

def ask_question(question):
    passages = rag.retrieve(question)
    answer, cited_passages = rag.generate_answer(question, passages)
    citations = "\n\n".join(f"📌 {src}" for _, src in cited_passages)
    return answer, citations

def export_answer(answer, citations):
    return save_to_doc(answer, citations)

with gr.Blocks(theme=gr.themes.Base(), css="body { background-color: #111; color: #eee; font-family: 'Cairo', sans-serif; }") as demo:
    gr.Image("assets/logo.png", height=120)
    gr.Markdown("### 🤖 مساعد الإيمان - روبوت ذكي لتحليل كتب البابا شنودة الثالث")  # "Faith Assistant - an AI bot for analyzing the books of Pope Shenouda III"

    with gr.Row():
        file_input = gr.File(label="📚 تحميل ملفات PDF أو DOCX", file_types=[".pdf", ".docx"], file_count="multiple")  # "Upload PDF or DOCX files"
        file_status = gr.Textbox(label="📌 الحالة", interactive=False)  # "Status"

    file_input.change(fn=process_files, inputs=file_input, outputs=file_status)

    question_input = gr.Textbox(label="✍️ اكتب سؤالك هنا", placeholder="مثال: ما هو دور الإيمان في المعجزات؟")  # "Write your question here" / "Example: What is the role of faith in miracles?"
    answer_output = gr.Textbox(label="🧠 الإجابة", lines=5)  # "Answer"
    citations_output = gr.Textbox(label="🔖 المراجع المستخدمة", lines=10)  # "References used"
    export_btn = gr.Button("💾 حفظ الإجابة كمستند")  # "Save the answer as a document"
    output_file = gr.File(label="📥 تحميل الملف")  # "Download the file"

    question_input.submit(fn=ask_question, inputs=question_input, outputs=[answer_output, citations_output])
    export_btn.click(fn=export_answer, inputs=[answer_output, citations_output], outputs=output_file)

demo.launch()
rag_pipeline.py
ADDED
@@ -0,0 +1,53 @@
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
from docx import Document

class ArabicRAGPipeline:
    def __init__(self):
        # Multilingual sentence embeddings for Arabic retrieval.
        self.embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        self.retriever_index = None
        self.text_chunks = []
        self.chunk_embeddings = None
        self.generator = pipeline(
            "text-generation",
            model="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",      # full Hub repo id
            tokenizer="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
            max_new_tokens=256
        )

    def build_index(self, chunks):
        # `chunks` is a list of (text, source) tuples from utils.chunk_text.
        self.text_chunks = chunks
        texts = [chunk[0] for chunk in chunks]
        self.chunk_embeddings = self.embedding_model.encode(texts, convert_to_tensor=False)
        dim = self.chunk_embeddings[0].shape[0]
        self.retriever_index = faiss.IndexFlatL2(dim)
        self.retriever_index.add(np.array(self.chunk_embeddings))

    def retrieve(self, query, top_k=3):
        if self.retriever_index is None:
            return []  # nothing indexed yet
        query_vec = self.embedding_model.encode([query])[0]
        distances, indices = self.retriever_index.search(np.array([query_vec]), top_k)
        return [self.text_chunks[i] for i in indices[0]]

    def generate_answer(self, query, retrieved_passages):
        context = "\n\n".join(p for p, _ in retrieved_passages)
        # Prompt (Arabic): "Answer the following question in Modern Standard Arabic,
        # relying only on the texts below. Give an answer supported by the original
        # text, and cite the reference used."
        prompt = f"""أجب باللغة العربية الفصحى على السؤال التالي، بالاعتماد فقط على النصوص التالية. قدم إجابة مدعومة من النص الأصلي، واذكر المرجع المستخدم:

النصوص:
{context}

السؤال: {query}
الإجابة:"""
        # The pipeline echoes the prompt, so keep only what follows "الإجابة:" ("Answer:").
        response = self.generator(prompt)[0]['generated_text']
        return response.split("الإجابة:")[-1].strip(), retrieved_passages

def save_to_doc(answer, citations):
    doc = Document()
    doc.add_heading("الإجابة", level=1)  # "Answer"
    doc.add_paragraph(answer)
    doc.add_heading("المصادر", level=2)  # "Sources"
    doc.add_paragraph(citations)
    path = "/tmp/faith_answer.docx"
    doc.save(path)
    return path
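The pipeline is also usable without the Gradio front end. A minimal sketch, assuming two hypothetical (text, source) chunks; note that constructing ArabicRAGPipeline downloads both the embedding model and the large generator:

```python
from rag_pipeline import ArabicRAGPipeline, save_to_doc

rag = ArabicRAGPipeline()  # loads the embedding model and the generator

# Hypothetical (text, source) tuples in the shape utils.chunk_text returns.
chunks = [
    ("الإيمان هو الثقة بما يرجى والإيقان بأمور لا ترى.", "حياة الإيمان.pdf"),
    ("المحبة تحتمل كل شيء وتصدق كل شيء.", "الحب الأخوي.docx"),
]
rag.build_index(chunks)

question = "ما هو الإيمان؟"  # "What is faith?"
passages = rag.retrieve(question, top_k=1)
answer, cited = rag.generate_answer(question, passages)

citations = "\n\n".join(f"📌 {src}" for _, src in cited)  # mirrors app.py's formatting
print(save_to_doc(answer, citations))  # -> /tmp/faith_answer.docx
```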
requirements.txt
ADDED
@@ -0,0 +1,10 @@
gradio
transformers
sentence-transformers
faiss-cpu
PyMuPDF
python-docx
llama-cpp-python
arabic_reshaper
python-bidi
scikit-learn
utils.py
ADDED
@@ -0,0 +1,32 @@
import fitz  # PyMuPDF
import docx
import re

def extract_texts_from_files(file):
    # Gradio may pass a tempfile wrapper or a plain path string; normalize to a path.
    path = getattr(file, "name", file)
    try:
        if path.endswith(".pdf"):
            doc = fitz.open(path)
            return "\n".join(page.get_text() for page in doc)
        elif path.endswith(".docx"):
            d = docx.Document(path)
            return "\n".join(p.text for p in d.paragraphs)
    except Exception:
        pass
    return ""

def clean_arabic(text):
    # Keep Arabic characters, whitespace, and sentence-ending punctuation;
    # dropping '.' and '!' here would break the sentence splitter in chunk_text.
    text = re.sub(r'[^\u0600-\u06FF\s.!?]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def chunk_text(text, source="مصدر غير معروف", max_words=150):  # default source: "unknown source"
    # Split on sentence boundaries, then pack sentences into chunks of roughly max_words.
    sentences = re.split(r'(?<=[.!؟])\s+', text)
    chunks = []
    current = []
    for sentence in sentences:
        current.append(sentence)
        if len(" ".join(current).split()) > max_words:
            chunks.append((" ".join(current), source))
            current = []
    if current:
        chunks.append((" ".join(current), source))
    return chunks
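A quick sanity check of the chunking contract, using a hypothetical two-sentence snippet; each chunk comes back as a (text, source) tuple, the shape ArabicRAGPipeline.build_index expects:

```python
from utils import clean_arabic, chunk_text

raw = "الإيمان هو الثقة بما يرجى. المحبة تحتمل كل شيء!"  # hypothetical sample
for text, source in chunk_text(clean_arabic(raw), source="sample.docx", max_words=5):
    print(source, "→", text)  # both sentences packed into one chunk under "sample.docx"
```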