Spaces:

ramysaidagieb
/

rag22V1

Configuration error

App Files Files Community

ramysaidagieb committed on May 22

Commit

74e2822

verified ·

1 Parent(s): 849d650

Upload 6 files

Browse files

Files changed (6) hide show

README.md +24 -0
app.app +69 -0
config.py +6 -0
document_processor.py +29 -0
rag_pipeline.py +43 -0
requirements.txt +8 -0

README.md ADDED Viewed

	@@ -0,0 +1,24 @@

+# Arabic Document-Based Chatbot System
+A RAG-based chatbot that answers questions from Arabic PDF/Word documents with cited sources.
+## Features
+- Processes Arabic PDF and Word documents
+- Answers questions in Arabic with source citations
+- Clean Arabic interface
+## Usage
+1. Upload Arabic documents (PDF or DOCX)
+2. Click "Process Files"
+3. Ask questions in Arabic
+4. Get answers with cited sources
+## Deployment on Hugging Face Spaces
+1. Create a new Space
+2. Upload all files
+3. Set `app.py` as the main file
+4. The Space will automatically install dependencies
+## Models Used
+- LLM: NousResearch/Nous-Hermes-2-Mistral-7B
+- Embedding Model: paraphrase-multilingual-MiniLM-L12-v2

app.app ADDED Viewed

	@@ -0,0 +1,69 @@

+import gradio as gr
+from rag_pipeline import ArabicRAGSystem
+from document_processor import process_pdf, process_docx
+import os
+# Shared RAG system used by the callbacks below; it is constructed once, when this module is imported.
+rag = ArabicRAGSystem()
+def process_uploaded_files(files):
+    """Handle uploaded documents"""
+    all_chunks = []
+    for file in files:
+        if file.name.endswith('.pdf'):
+            chunks = process_pdf(file.name)
+        elif file.name.endswith('.docx'):
+            chunks = process_docx(file.name)
+        all_chunks.extend(chunks)
+    if all_chunks:
+        rag.build_index(all_chunks)
+        return "تم تحميل المستندات بنجاح! يمكنك الآن طرح الأسئلة."
+    return "حدث خطأ في معالجة الملفات."
+def respond(question, history):
+    """Generate response to user question"""
+    if not rag.index:
+        return "الرجاء تحميل المستندات أولاً"
+    context = rag.retrieve(question)
+    answer = rag.generate_answer(question, context)
+    cited_answer = f"{answer}\n\nالمصادر:\n" + "\n".join(
+        f"- {c[:100]}..." for c in context
+    )
+    return cited_answer
+with gr.Blocks(title="نظام الدردشة العربي المدعوم بالوثائق", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## نظام الدردشة العربي المدعوم بالوثائق")
+    with gr.Row():
+        with gr.Column():
+            file_output = gr.File(label="تحميل المستندات", file_count="multiple")
+            upload_button = gr.Button("معالجة الملفات")
+            upload_status = gr.Textbox(label="حالة التحميل")
+        with gr.Column():
+            chatbot = gr.Chatbot(height=400)
+            question = gr.Textbox(label="اكتب سؤالك هنا")
+            submit = gr.Button("إرسال")
+    upload_button.click(
+        process_uploaded_files,
+        inputs=file_output,
+        outputs=upload_status
+    )
+    submit.click(
+        respond,
+        inputs=[question, chatbot],
+        outputs=chatbot
+    )
+    question.submit(
+        respond,
+        inputs=[question, chatbot],
+        outputs=chatbot
+    )
+if __name__ == "__main__":
+    demo.launch()

config.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Central model and chunking settings; rag_pipeline.py imports this dict.
+MODEL_CONFIG = {
+    # Model id passed to the transformers "text-generation" pipeline.
+    "llm": "NousResearch/Nous-Hermes-2-Mistral-7B",
+    # Model id passed to SentenceTransformer for embedding chunks and queries.
+    "embedding_model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    # NOTE(review): chunk_size and chunk_overlap are not read by any file shown
+    # in this commit; presumably meant for a text splitter. Confirm the unit
+    # (characters vs. tokens) before wiring them in.
+    "chunk_size": 512,
+    "chunk_overlap": 64
+}

document_processor.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import re
+import fitz  # PyMuPDF
+from docx import Document
+from typing import List
+def clean_arabic_text(text: str) -> str:
+    """Normalize Arabic text and remove diacritics"""
+    text = re.sub(r'[\u064B-\u065F]', '', text)  # Remove diacritics
+    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)
+    return text.strip()
+def process_pdf(file_path: str) -> List[str]:
+    """Extract text from PDF"""
+    doc = fitz.open(file_path)
+    chunks = []
+    for page in doc:
+        text = page.get_text()
+        cleaned = clean_arabic_text(text)
+        if cleaned: chunks.append(cleaned)
+    return chunks
+def process_docx(file_path: str) -> List[str]:
+    """Extract text from Word document"""
+    doc = Document(file_path)
+    chunks = []
+    for para in doc.paragraphs:
+        cleaned = clean_arabic_text(para.text)
+        if cleaned: chunks.append(cleaned)
+    return chunks

rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from typing import List
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+from config import MODEL_CONFIG
+class ArabicRAGSystem:
+    def __init__(self):
+        self.embedder = SentenceTransformer(MODEL_CONFIG["embedding_model"])
+        self.llm = pipeline("text-generation", model=MODEL_CONFIG["llm"])
+        self.index = None
+        self.documents = []
+    def build_index(self, chunks: List[str]):
+        """Create FAISS index from document chunks"""
+        self.documents = chunks
+        embeddings = self.embedder.encode(chunks, show_progress_bar=True)
+        self.index = faiss.IndexFlatIP(embeddings.shape[1])
+        self.index.add(embeddings)
+    def retrieve(self, query: str, k: int = 3) -> List[str]:
+        """Retrieve relevant document chunks"""
+        query_embedding = self.embedder.encode([query])
+        distances, indices = self.index.search(query_embedding, k)
+        return [self.documents[i] for i in indices[0]]
+    def generate_answer(self, question: str, context: List[str]) -> str:
+        """Generate answer using LLM with retrieved context"""
+        prompt = f"""استخدم المعلومات التالية للإجابة على السؤال:
+        السياق:
+        {'\n'.join(context)}
+        السؤال: {question}
+        الإجابة:"""
+        result = self.llm(
+            prompt,
+            max_new_tokens=256,
+            temperature=0.7,
+            do_sample=True
+        )
+        return result[0]["generated_text"].replace(prompt, "")

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio>=3.0
+transformers>=4.30
+sentence-transformers>=2.2.2
+faiss-cpu>=1.7.4
+pymupdf>=1.22.5
+python-docx>=0.8.11
+torch>=2.0.1
+accelerate