ramysaidagieb committed
Commit 74e2822 · verified · 1 Parent(s): 849d650

Upload 6 files

Files changed (6)
  1. README.md +24 -0
  2. app.app +69 -0
  3. config.py +6 -0
  4. document_processor.py +29 -0
  5. rag_pipeline.py +43 -0
  6. requirements.txt +8 -0
README.md ADDED
@@ -0,0 +1,24 @@
+ # Arabic Document-Based Chatbot System
+
+ A RAG-based chatbot that answers questions from Arabic PDF/Word documents with cited sources.
+
+ ## Features
+ - Processes Arabic PDF and Word documents
+ - Answers questions in Arabic with source citations
+ - Clean Arabic interface
+
+ ## Usage
+ 1. Upload Arabic documents (PDF or DOCX)
+ 2. Click "Process Files"
+ 3. Ask questions in Arabic
+ 4. Get answers with cited sources
+
+ ## Deployment on Hugging Face Spaces
+ 1. Create a new Space
+ 2. Upload all files
+ 3. Set `app.py` as the main file
+ 4. The Space will install dependencies automatically
+
+ ## Models Used
+ - LLM: NousResearch/Nous-Hermes-2-Mistral-7B
+ - Embedding Model: paraphrase-multilingual-MiniLM-L12-v2
app.app ADDED
@@ -0,0 +1,69 @@
+ import gradio as gr
+ from rag_pipeline import ArabicRAGSystem
+ from document_processor import process_pdf, process_docx
+ import os
+
+ rag = ArabicRAGSystem()
+
+ def process_uploaded_files(files):
+     """Handle uploaded documents"""
+     if not files:
+         return "الرجاء اختيار الملفات أولاً"  # "Please select files first"
+     all_chunks = []
+     for file in files:
+         if file.name.endswith('.pdf'):
+             chunks = process_pdf(file.name)
+         elif file.name.endswith('.docx'):
+             chunks = process_docx(file.name)
+         else:
+             continue  # skip unsupported file types
+         all_chunks.extend(chunks)
+
+     if all_chunks:
+         rag.build_index(all_chunks)
+         return "تم تحميل المستندات بنجاح! يمكنك الآن طرح الأسئلة."  # "Documents loaded successfully! You can now ask questions."
+     return "حدث خطأ في معالجة الملفات."  # "An error occurred while processing the files."
+
+ def respond(question, history):
+     """Generate a response to the user question and append it to the chat history"""
+     history = history or []
+     if rag.index is None:
+         history.append((question, "الرجاء تحميل المستندات أولاً"))  # "Please upload documents first"
+         return history
+
+     context = rag.retrieve(question)
+     answer = rag.generate_answer(question, context)
+
+     # Append the retrieved chunks as cited sources ("المصادر" = "Sources")
+     cited_answer = f"{answer}\n\nالمصادر:\n" + "\n".join(
+         f"- {c[:100]}..." for c in context
+     )
+
+     history.append((question, cited_answer))
+     return history
+
+ with gr.Blocks(title="نظام الدردشة العربي المدعوم بالوثائق", theme=gr.themes.Soft()) as demo:  # "Arabic document-backed chat system"
+     gr.Markdown("## نظام الدردشة العربي المدعوم بالوثائق")
+
+     with gr.Row():
+         with gr.Column():
+             file_output = gr.File(label="تحميل المستندات", file_count="multiple")  # "Upload documents"
+             upload_button = gr.Button("معالجة الملفات")  # "Process files"
+             upload_status = gr.Textbox(label="حالة التحميل")  # "Upload status"
+
+         with gr.Column():
+             chatbot = gr.Chatbot(height=400)
+             question = gr.Textbox(label="اكتب سؤالك هنا")  # "Type your question here"
+             submit = gr.Button("إرسال")  # "Send"
+
+     upload_button.click(
+         process_uploaded_files,
+         inputs=file_output,
+         outputs=upload_status
+     )
+
+     submit.click(
+         respond,
+         inputs=[question, chatbot],
+         outputs=chatbot
+     )
+     question.submit(
+         respond,
+         inputs=[question, chatbot],
+         outputs=chatbot
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
config.py ADDED
@@ -0,0 +1,6 @@
+ MODEL_CONFIG = {
+     "llm": "NousResearch/Nous-Hermes-2-Mistral-7B",
+     "embedding_model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+     "chunk_size": 512,
+     "chunk_overlap": 64
+ }
document_processor.py ADDED
@@ -0,0 +1,29 @@
+ import re
+ import fitz  # PyMuPDF
+ from docx import Document
+ from typing import List
+
+ def clean_arabic_text(text: str) -> str:
+     """Normalize Arabic text and remove diacritics"""
+     text = re.sub(r'[\u064B-\u065F]', '', text)  # Remove diacritics (tashkeel)
+     # Keep only Arabic characters and whitespace
+     text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)
+     return text.strip()
+
+ def process_pdf(file_path: str) -> List[str]:
+     """Extract text from a PDF, one cleaned chunk per page"""
+     doc = fitz.open(file_path)
+     chunks = []
+     for page in doc:
+         text = page.get_text()
+         cleaned = clean_arabic_text(text)
+         if cleaned:
+             chunks.append(cleaned)
+     return chunks
+
+ def process_docx(file_path: str) -> List[str]:
+     """Extract text from a Word document, one cleaned chunk per paragraph"""
+     doc = Document(file_path)
+     chunks = []
+     for para in doc.paragraphs:
+         cleaned = clean_arabic_text(para.text)
+         if cleaned:
+             chunks.append(cleaned)
+     return chunks
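
Note that `config.py` defines `chunk_size` and `chunk_overlap`, while the functions above return whole pages and paragraphs as chunks. A minimal sketch of how those settings could be applied before indexing, assuming a hypothetical helper `split_into_chunks` that is not part of the uploaded files:

```python
from typing import List
from config import MODEL_CONFIG

def split_into_chunks(texts: List[str],
                      chunk_size: int = MODEL_CONFIG["chunk_size"],
                      chunk_overlap: int = MODEL_CONFIG["chunk_overlap"]) -> List[str]:
    """Split cleaned page/paragraph texts into overlapping character windows."""
    chunks = []
    step = chunk_size - chunk_overlap  # advance by chunk_size minus the overlap
    for text in texts:
        for start in range(0, max(len(text), 1), step):
            piece = text[start:start + chunk_size]
            if piece.strip():
                chunks.append(piece)
    return chunks
```

Splitting this way keeps each embedded passage within the embedding model's effective context while the overlap preserves sentences that straddle a boundary.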
rag_pipeline.py ADDED
@@ -0,0 +1,43 @@
+ from typing import List
+
+ from sentence_transformers import SentenceTransformer
+ from transformers import pipeline
+ import faiss
+ import numpy as np
+ from config import MODEL_CONFIG
+
+ class ArabicRAGSystem:
+     def __init__(self):
+         self.embedder = SentenceTransformer(MODEL_CONFIG["embedding_model"])
+         self.llm = pipeline("text-generation", model=MODEL_CONFIG["llm"])
+         self.index = None
+         self.documents = []
+
+     def build_index(self, chunks: List[str]):
+         """Create FAISS index from document chunks"""
+         self.documents = chunks
+         # Normalize embeddings so inner product equals cosine similarity
+         embeddings = self.embedder.encode(chunks, show_progress_bar=True, normalize_embeddings=True)
+         embeddings = np.asarray(embeddings, dtype="float32")  # FAISS expects float32
+         self.index = faiss.IndexFlatIP(embeddings.shape[1])
+         self.index.add(embeddings)
+
+     def retrieve(self, query: str, k: int = 3) -> List[str]:
+         """Retrieve the k most relevant document chunks"""
+         query_embedding = self.embedder.encode([query], normalize_embeddings=True)
+         query_embedding = np.asarray(query_embedding, dtype="float32")
+         distances, indices = self.index.search(query_embedding, k)
+         return [self.documents[i] for i in indices[0] if i != -1]
+
+     def generate_answer(self, question: str, context: List[str]) -> str:
+         """Generate answer using LLM with retrieved context"""
+         # Join outside the f-string: backslashes are not allowed inside
+         # f-string expressions before Python 3.12
+         joined_context = "\n".join(context)
+         # Arabic prompt: "Use the following information to answer the question:
+         # Context: ... / Question: ... / Answer:"
+         prompt = f"""استخدم المعلومات التالية للإجابة على السؤال:
+
+ السياق:
+ {joined_context}
+
+ السؤال: {question}
+ الإجابة:"""
+
+         result = self.llm(
+             prompt,
+             max_new_tokens=256,
+             temperature=0.7,
+             do_sample=True
+         )
+         return result[0]["generated_text"].replace(prompt, "")
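
A minimal sketch of how these pieces fit together outside the Gradio UI; the file path and question below are hypothetical placeholders, not part of the uploaded files:

```python
from document_processor import process_pdf
from rag_pipeline import ArabicRAGSystem

# Hypothetical input path; replace with a real Arabic PDF
chunks = process_pdf("docs/sample.pdf")

rag = ArabicRAGSystem()   # loads the embedding model and the LLM
rag.build_index(chunks)   # embeds the chunks and builds the FAISS index

question = "ما هو موضوع المستند؟"  # "What is the document about?"
context = rag.retrieve(question, k=3)
answer = rag.generate_answer(question, context)
print(answer)
```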
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio>=3.0
+ transformers>=4.30
+ sentence-transformers>=2.2.2
+ faiss-cpu>=1.7.4
+ pymupdf>=1.22.5
+ python-docx>=0.8.11
+ torch>=2.0.1
+ accelerate