Spaces:

ramysaidagieb
/

rag22v2

Runtime error

App Files Files Community

ramysaidagieb committed on May 22

Commit

c90b40e

verified ·

1 Parent(s): dcccf46

Upload 5 files

Browse files

Files changed (5) hide show

app.py +70 -0
markdown.md +34 -0
rag_pipeline.py +84 -0
requirements.txt +12 -0
utils.py +85 -0

app.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import gradio as gr
+import numpy as np
+from utils import DocumentProcessor
+from rag_pipeline import ArabicRAGSystem
+css = """
+.rtl {direction: rtl; text-align: right;}
+.header {background: #f0f2f6; padding: 20px; border-radius: 10px;}
+.markdown-body {font-family: 'Amiri', serif; font-size: 18px;}
+.highlight {background: #fff3cd; padding: 10px; border-radius: 5px;}
+"""
+with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
+    rag = ArabicRAGSystem()
+    with gr.Column(elem_classes="header"):
+        gr.Markdown("""
+        <div class='rtl'>
+        <h1 style="text-align:center; color: #2B547E;">نظام التحليل اللاهوتي المدعوم بالذكاء الاصطناعي</h1>
+        <p style="text-align:center">نظام لتحليل الكتب الدينية العربية وإجابة الأسئلة مع الإشارة إلى المصادر</p>
+        </div>
+        """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            file_upload = gr.File(label="تحميل الملفات", file_types=[".pdf", ".docx"],
+                               file_count="multiple", elem_classes="rtl")
+            with gr.Accordion("إعدادات البحث", open=False):
+                top_k = gr.Slider(3, 10, value=5, step=1, label="عدد المقاطع المستخدمة")
+                temperature = gr.Slider(0.1, 1.0, value=0.7, label="درجة الإبداعية")
+        with gr.Column(scale=2):
+            question = gr.Textbox(label="اكتب سؤالك هنا", lines=3, elem_classes="rtl")
+            answer = gr.Markdown(label="الإجابة", elem_classes=["markdown-body", "rtl"])
+            sources = gr.DataFrame(label="المصادر المستخدمة",
+                                 headers=["النص", "المصدر", "الصفحة", "الثقة"],
+                                 elem_classes="rtl")
+    def process_query(files, question, top_k, temp):
+        if not files or not question:
+            return "", []
+        processor = DocumentProcessor()
+        documents = processor.process_documents(files)
+        answer_text, sources_data = rag.generate_answer(
+            question=question,
+            documents=documents,
+            top_k=top_k,
+            temperature=temp
+        )
+        formatted_sources = []
+        for src in sources_data:
+            formatted_sources.append([
+                src['text'],
+                src['source'],
+                src['page'],
+                f"{src['score']:.2f}"
+            ])
+        return answer_text, formatted_sources
+    question.submit(
+        process_query,
+        inputs=[file_upload, question, top_k, temperature],
+        outputs=[answer, sources]
+    )
+if __name__ == "__main__":
+    demo.launch()

markdown.md ADDED Viewed

	@@ -0,0 +1,34 @@

+---
+title: "نظام الذكاء الاصطناعي لتحليل النصوص الدينية"
+emoji: "📖"
+colorFrom: "blue"
+colorTo: "indigo"
+sdk: "gradio"
+sdk_version: "4.13.0"
+app_file: "app.py"
+pinned: true
+---
+# نظام التحليل الديني المدعوم بالذكاء الاصطناعي
+## المميزات الرئيسية
+- تحليل متقدم للكتب والمقالات الدينية العربية
+- إجابات مدعومة بمراجع دقيقة من النصوص
+- واجهة مستخدم عربية كاملة (اتجاه من اليمين لليسار)
+- دعم كامل لملفات PDF وDOCX العربية
+- نماذج مفتوحة المصدر ومجانية بالكامل
+## كيفية الاستخدام
+### على منصة Hugging Face:
+1. انتقل إلى صفحة النموذج
+2. اضغط على "تشغيل Space"
+3. انتظر اكتمال التحميل (يستغرق حوالي دقيقتين)
+4. ارفع ملفاتك وابدأ بطرح الأسئلة
+### للتشغيل المحلي:
+```bash
+git clone https://huggingface.co/spaces/[اسم المستخدم]/[اسم المشروع]
+cd [اسم المشروع]
+pip install -r requirements.txt
+python app.py
+```

rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,84 @@

+from sentence_transformers import CrossEncoder, SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import faiss
+import numpy as np
+from typing import List, Dict
+class ArabicRAGSystem:
+    def __init__(self):
+        # Initialize models
+        self.embedding_model = SentenceTransformer("aubmindlab/bert-base-arabertv2")
+        self.cross_encoder = CrossEncoder("Arabic-Misc/roberta-base-arabic-camelbert-da-msa")
+        self.tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b-chat")
+        self.llm = AutoModelForCausalLM.from_pretrained("inception-mbzuai/jais-13b-chat")
+        self.index = faiss.IndexFlatL2(768)
+    def _create_index(self, documents: List[Dict]):
+        texts = [doc["text"] for doc in documents]
+        embeddings = self.embedding_model.encode(texts)
+        self.index.add(np.array(embeddings))
+    def generate_answer(self, question: str, documents: List[Dict],
+                      top_k: int = 5, temperature: float = 0.7) -> tuple:
+        # Indexing phase
+        self._create_index(documents)
+        # Two-stage retrieval
+        query_embedding = self.embedding_model.encode([question])
+        distances, indices = self.index.search(query_embedding, top_k*2)
+        # Re-ranking with cross-encoder
+        pairs = [[question, documents[idx]["text"]] for idx in indices[0]]
+        scores = self.cross_encoder.predict(pairs)
+        ranked_indices = np.argsort(scores)[::-1][:top_k]
+        # Prepare context
+        context = "\n\n".join([
+            f"المصدر: {documents[idx]['source']}\n"
+            f"الصفحة: {documents[idx]['page']}\n"
+            f"النص: {documents[idx]['text']}"
+            for idx in [indices[0][i] for i in ranked_indices]
+        ])
+        # Generate answer
+        prompt = f"""
+        أنت خبير في التحليل الديني. قم بالإجابة على السؤال التالي بناءً على السياق المقدم فقط:
+        السياق:
+        {context}
+        السؤال:
+        {question}
+        التعليمات:
+        - أجب باللغة العربية الفصحى
+        - استخدم علامات التنسيق المناسبة
+        - أشر إلى المصادر باستخدام التنسيق [المصدر: اسم الملف، الصفحة: رقم]
+        - إذا لم توجد إجابة واضحة، قل "لا تتوفر معلومات كافية"
+        الإجابة:
+        """.strip()
+        inputs = self.tokenizer(prompt, return_tensors="pt")
+        outputs = self.llm.generate(
+            inputs.input_ids,
+            max_new_tokens=512,
+            temperature=temperature,
+            do_sample=True,
+            pad_token_id=self.tokenizer.eos_token_id
+        )
+        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        answer = answer.split("الإجابة:")[-1].strip()
+        # Prepare sources
+        sources = []
+        for idx in [indices[0][i] for i in ranked_indices]:
+            sources.append({
+                "text": documents[idx]["text"],
+                "source": documents[idx]["source"],
+                "page": documents[idx]["page"],
+                "score": float(scores[idx])
+            })
+        return answer, sources

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+gradio>=3.50
+pymupdf>=1.23.0
+python-docx>=0.8.11
+sentence-transformers>=2.3.1
+faiss-cpu>=1.7.4
+transformers>=4.38.0
+pyarabic>=0.6.14
+langchain>=0.1.0
+torch>=2.0.0
+safetensors>=0.4.0
+arabic-reshaper>=2.1.4
+python-bidi>=0.4.2

utils.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import fitz
+from docx import Document
+import re
+import pyarabic.araby as araby
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from typing import List, Dict
+class DocumentProcessor:
+    def __init__(self, chunk_size=512, chunk_overlap=64):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separators=["\n\n", "۔", ".", "؟", "!", "\n"]
+        )
+    def _normalize_arabic(self, text: str) -> str:
+        text = araby.strip_diacritics(text)
+        text = araby.normalize_ligatures(text)
+        text = araby.normalize_hamza(text)
+        return re.sub(r'\s+', ' ', text).strip()
+    def _process_pdf(self, file_path: str) -> List[Dict]:
+        doc = fitz.open(file_path)
+        pages = []
+        for page_num, page in enumerate(doc):
+            text = ""
+            blocks = page.get_text("dict")["blocks"]
+            for block in blocks:
+                if "lines" in block:
+                    for line in block["lines"]:
+                        for span in line["spans"]:
+                            if span["flags"] & 16:  # Bold text
+                                text += f"**{span['text']}** "
+                            else:
+                                text += span["text"] + " "
+            pages.append({
+                "text": self._normalize_arabic(text),
+                "source": file_path,
+                "page": page_num + 1
+            })
+        return pages
+    def _process_docx(self, file_path: str) -> List[Dict]:
+        doc = Document(file_path)
+        sections = []
+        current_section = {"text": "", "source": file_path, "page": 1}
+        for para in doc.paragraphs:
+            if para.style.name.startswith('Heading'):
+                if current_section["text"]:
+                    sections.append(current_section)
+                    current_section = {"text": "", "source": file_path, "page": len(sections)+1}
+                current_section["text"] += f"\n# {para.text}\n"
+            else:
+                current_section["text"] += para.text + "\n"
+        if current_section["text"]:
+            sections.append(current_section)
+        return [{
+            "text": self._normalize_arabic(s["text"]),
+            "source": s["source"],
+            "page": s["page"]
+        } for s in sections]
+    def process_documents(self, files: List) -> List[Dict]:
+        all_chunks = []
+        for file_info in files:
+            if file_info.name.endswith(".pdf"):
+                pages = self._process_pdf(file_info.name)
+            elif file_info.name.endswith(".docx"):
+                pages = self._process_docx(file_info.name)
+            else:
+                continue
+            for page in pages:
+                chunks = self.text_splitter.split_text(page["text"])
+                for chunk in chunks:
+                    all_chunks.append({
+                        "text": chunk,
+                        "source": page["source"],
+                        "page": page["page"]
+                    })
+        return all_chunks