Spaces:
Configuration error
Configuration error
Upload 6 files
Browse files- README.md +24 -0
- app.app +69 -0
- config.py +6 -0
- document_processor.py +29 -0
- rag_pipeline.py +43 -0
- requirements.txt +8 -0
README.md
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Arabic Document-Based Chatbot System
|
2 |
+
|
3 |
+
A RAG-based chatbot that answers questions from Arabic PDF/Word documents with cited sources.
|
4 |
+
|
5 |
+
## Features
|
6 |
+
- Processes Arabic PDF and Word documents
|
7 |
+
- Answers questions in Arabic with source citations
|
8 |
+
- Clean Arabic interface
|
9 |
+
|
10 |
+
## Usage
|
11 |
+
1. Upload Arabic documents (PDF or DOCX)
|
12 |
+
2. Click "Process Files"
|
13 |
+
3. Ask questions in Arabic
|
14 |
+
4. Get answers with cited sources
|
15 |
+
|
16 |
+
## Deployment on Hugging Face Spaces
|
17 |
+
1. Create new Space
|
18 |
+
2. Upload all files
|
19 |
+
3. Set `app.py` as the main file
|
20 |
+
4. The Space will automatically install dependencies
|
21 |
+
|
22 |
+
## Models Used
|
23 |
+
- LLM: NousResearch/Nous-Hermes-2-Mistral-7B
|
24 |
+
- Embedding Model: paraphrase-multilingual-MiniLM-L12-v2
|
app.app
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from rag_pipeline import ArabicRAGSystem
|
3 |
+
from document_processor import process_pdf, process_docx
|
4 |
+
import os
|
5 |
+
|
6 |
+
# Single module-level RAG system shared by the upload and chat handlers in this
# file. Constructing it here runs ArabicRAGSystem.__init__ as soon as this
# module is imported.
rag = ArabicRAGSystem()
|
7 |
+
|
8 |
+
def process_uploaded_files(files):
    """Extract text from the uploaded documents and (re)build the search index.

    Args:
        files: Uploaded files as delivered by the file-upload component. Each
            item may be an object exposing a ``name`` attribute holding the
            path on disk, or the path itself. ``None`` or an empty list is
            treated as "nothing uploaded".

    Returns:
        An Arabic status message: success when at least one text chunk was
        extracted and indexed, otherwise a generic processing-error message.
    """
    all_chunks = []
    for file in files or []:
        # Accept both file-like wrappers (``.name``) and plain path strings.
        path = getattr(file, "name", file)
        extension_probe = str(path).lower()  # match ".PDF" / ".Docx" too

        if extension_probe.endswith('.pdf'):
            all_chunks.extend(process_pdf(path))
        elif extension_probe.endswith('.docx'):
            all_chunks.extend(process_docx(path))
        # Any other extension is skipped. Previously such a file either raised
        # NameError (first file) or re-added the previous file's chunks.

    if all_chunks:
        rag.build_index(all_chunks)
        return "تم تحميل المستندات بنجاح! يمكنك الآن طرح الأسئلة."
    return "حدث خطأ في معالجة الملفات."
|
22 |
+
|
23 |
+
def respond(question, history):
    """Answer a user question and return the updated chat history.

    The handler is wired to a chatbot component as its output, which displays
    a whole conversation rather than a single string, so the reply is appended
    to ``history`` instead of being returned on its own.

    Args:
        question: The text the user typed.
        history: The conversation so far, as a list of (user, bot) pairs, or
            ``None`` when the chat is empty.

    Returns:
        A new list containing the previous turns plus one (question, reply)
        pair. The reply is the generated answer followed by a "sources"
        section listing the first 100 characters of each retrieved chunk.
    """
    # NOTE(review): assumes the chatbot uses the (user, bot) pair format;
    # confirm against the installed Gradio version.
    conversation = list(history or [])

    # Nothing to answer: leave the conversation untouched.
    if not question or not question.strip():
        return conversation

    if rag.index is None:
        reply = "الرجاء تحميل المستندات أولاً"
    else:
        context = rag.retrieve(question)
        answer = rag.generate_answer(question, context)
        reply = f"{answer}\n\nالمصادر:\n" + "\n".join(
            f"- {c[:100]}..." for c in context
        )

    conversation.append((question, reply))
    return conversation
|
36 |
+
|
37 |
+
# Build the Gradio UI: a titled page with two side-by-side columns, one for
# uploading documents and one for chatting.
with gr.Blocks(title="نظام الدردشة العربي المدعوم بالوثائق", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## نظام الدردشة العربي المدعوم بالوثائق")

    with gr.Row():
        # Left column: document upload, a button to trigger processing, and a
        # textbox that shows the status message returned by the handler.
        with gr.Column():
            file_output = gr.File(label="تحميل المستندات", file_count="multiple")
            upload_button = gr.Button("معالجة الملفات")
            upload_status = gr.Textbox(label="حالة التحميل")

        # Right column: conversation display, question input and send button.
        with gr.Column():
            chatbot = gr.Chatbot(height=400)
            question = gr.Textbox(label="اكتب سؤالك هنا")
            submit = gr.Button("إرسال")

    # Clicking the upload button passes the selected files to
    # process_uploaded_files and shows its return value in the status box.
    upload_button.click(
        process_uploaded_files,
        inputs=file_output,
        outputs=upload_status
    )

    # Both the send button and pressing Enter in the question box call
    # respond with the question text and the current chatbot value; the
    # return value is written back to the chatbot.
    submit.click(
        respond,
        inputs=[question, chatbot],
        outputs=chatbot
    )
    question.submit(
        respond,
        inputs=[question, chatbot],
        outputs=chatbot
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
config.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Shared settings for the RAG pipeline.
MODEL_CONFIG = {
    # Model identifier passed to the text-generation pipeline.
    "llm": "NousResearch/Nous-Hermes-2-Mistral-7B",
    # Model identifier passed to SentenceTransformer for embeddings.
    "embedding_model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    # NOTE(review): chunk_size / chunk_overlap are not read by any code visible
    # in this listing; presumably intended for splitting documents - confirm.
    "chunk_size": 512,
    "chunk_overlap": 64
}
|
document_processor.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
from docx import Document
|
4 |
+
from typing import List
|
5 |
+
|
6 |
+
def clean_arabic_text(text: str) -> str:
    """Strip diacritics and every non-Arabic, non-whitespace character.

    Two passes are applied in order: first the combining marks in
    U+064B-U+065F are dropped, then anything outside the Arabic Unicode
    blocks (and whitespace) is removed. Leading and trailing whitespace is
    trimmed from the result.
    """
    removal_passes = (
        r'[\u064B-\u065F]',  # diacritics
        r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]',
    )
    cleaned = text
    for pattern in removal_passes:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()
|
11 |
+
|
12 |
+
def process_pdf(file_path: str) -> List[str]:
    """Extract cleaned text from a PDF, one chunk per non-empty page.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The cleaned text of each page, in page order. Pages that are empty
        after cleaning are omitted.
    """
    chunks = []
    # Open as a context manager so the document is closed even if text
    # extraction raises; previously it was never closed.
    with fitz.open(file_path) as doc:
        for page in doc:
            cleaned = clean_arabic_text(page.get_text())
            if cleaned:
                chunks.append(cleaned)
    return chunks
|
21 |
+
|
22 |
+
def process_docx(file_path: str) -> List[str]:
    """Return the cleaned text of every non-empty paragraph in a Word file.

    Paragraphs are cleaned with clean_arabic_text and kept in document order;
    paragraphs that are empty after cleaning are dropped.
    """
    paragraphs = Document(file_path).paragraphs
    cleaned_paragraphs = (clean_arabic_text(p.text) for p in paragraphs)
    return [paragraph for paragraph in cleaned_paragraphs if paragraph]
|
rag_pipeline.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import SentenceTransformer
|
2 |
+
from transformers import pipeline
|
3 |
+
import faiss
|
4 |
+
import numpy as np
|
5 |
+
from config import MODEL_CONFIG
|
6 |
+
|
7 |
+
class ArabicRAGSystem:
    """Retrieval-augmented question answering over text chunks.

    Chunks are embedded with a sentence-embedding model and stored in a FAISS
    inner-product index; questions are answered by a text-generation pipeline
    prompted with the best-matching chunks.

    Attributes are plain instance fields:
      embedder  - SentenceTransformer built from MODEL_CONFIG["embedding_model"]
      llm       - text-generation pipeline built from MODEL_CONFIG["llm"]
      index     - FAISS index, or None until build_index has indexed chunks
      documents - the chunk texts, positionally aligned with the index rows
    """

    def __init__(self):
        self.embedder = SentenceTransformer(MODEL_CONFIG["embedding_model"])
        self.llm = pipeline("text-generation", model=MODEL_CONFIG["llm"])
        self.index = None
        self.documents = []

    # Annotations use the built-in ``list[str]`` because ``typing.List`` is not
    # imported in this module (the original annotations raised NameError).
    def build_index(self, chunks: list[str]):
        """Create a FAISS index from document chunks, replacing any old one.

        Args:
            chunks: Text chunks to index. An empty input clears the index.
        """
        documents = list(chunks)
        if not documents:
            # Nothing to embed; reset so callers see "no index".
            self.documents = []
            self.index = None
            return

        # Unit-length vectors make the inner product equal cosine similarity.
        embeddings = self.embedder.encode(
            documents, show_progress_bar=True, normalize_embeddings=True
        )
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        # Publish both together so documents and index rows stay aligned.
        self.documents = documents
        self.index = index

    def retrieve(self, query: str, k: int = 3) -> list[str]:
        """Retrieve the chunks most similar to the query.

        Args:
            query: The question text.
            k: Maximum number of chunks to return.

        Returns:
            Up to ``k`` chunks, best match first; an empty list when nothing
            has been indexed yet.
        """
        if self.index is None or not self.documents:
            return []

        # Asking for more neighbours than exist makes FAISS pad with -1,
        # which would silently index the last document.
        k = min(k, len(self.documents))
        if k <= 0:
            return []

        query_embedding = self.embedder.encode(
            [query], normalize_embeddings=True
        )
        query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)
        _, indices = self.index.search(query_embedding, k)
        return [
            self.documents[i] for i in indices[0] if 0 <= i < len(self.documents)
        ]

    def generate_answer(self, question: str, context: list[str]) -> str:
        """Generate an answer with the LLM, grounded in the retrieved context.

        Args:
            question: The user's question.
            context: Retrieved chunks to place in the prompt.

        Returns:
            The generated continuation, without the prompt text.
        """
        # Joined outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        joined_context = "\n".join(context)
        prompt = (
            "استخدم المعلومات التالية للإجابة على السؤال:\n"
            "\n"
            "السياق:\n"
            f"{joined_context}\n"
            "\n"
            f"السؤال: {question}\n"
            "الإجابة:"
        )

        result = self.llm(
            prompt,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            return_full_text=False,  # ask for the continuation only
        )
        generated = result[0]["generated_text"]
        # Defensive: drop the prompt if the pipeline echoed it anyway.
        return generated.removeprefix(prompt)
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=3.0
|
2 |
+
transformers>=4.30
|
3 |
+
sentence-transformers>=2.2.2
|
4 |
+
faiss-cpu>=1.7.4
|
5 |
+
pymupdf>=1.22.5
|
6 |
+
python-docx>=0.8.11
|
7 |
+
torch>=2.0.1
|
8 |
+
accelerate
|