Spaces:
Runtime error
Upload 5 files
Browse files
- app.py +70 -0
- markdown.md +34 -0
- rag_pipeline.py +84 -0
- requirements.txt +12 -0
- utils.py +85 -0
app.py
ADDED
@@ -0,0 +1,70 @@
import gradio as gr
import numpy as np
from utils import DocumentProcessor
from rag_pipeline import ArabicRAGSystem

css = """
.rtl {direction: rtl; text-align: right;}
.header {background: #f0f2f6; padding: 20px; border-radius: 10px;}
.markdown-body {font-family: 'Amiri', serif; font-size: 18px;}
.highlight {background: #fff3cd; padding: 10px; border-radius: 5px;}
"""

with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    rag = ArabicRAGSystem()

    with gr.Column(elem_classes="header"):
        gr.Markdown("""
        <div class='rtl'>
        <h1 style="text-align:center; color: #2B547E;">نظام التحليل اللاهوتي المدعوم بالذكاء الاصطناعي</h1>
        <p style="text-align:center">نظام لتحليل الكتب الدينية العربية وإجابة الأسئلة مع الإشارة إلى المصادر</p>
        </div>
        """)

    with gr.Row():
        with gr.Column(scale=1):
            file_upload = gr.File(label="تحميل الملفات", file_types=[".pdf", ".docx"],
                                  file_count="multiple", elem_classes="rtl")
            with gr.Accordion("إعدادات البحث", open=False):
                top_k = gr.Slider(3, 10, value=5, step=1, label="عدد المقاطع المستخدمة")
                temperature = gr.Slider(0.1, 1.0, value=0.7, label="درجة الإبداعية")

        with gr.Column(scale=2):
            question = gr.Textbox(label="اكتب سؤالك هنا", lines=3, elem_classes="rtl")
            answer = gr.Markdown(label="الإجابة", elem_classes=["markdown-body", "rtl"])
            sources = gr.DataFrame(label="المصادر المستخدمة",
                                   headers=["النص", "المصدر", "الصفحة", "الثقة"],
                                   elem_classes="rtl")

    def process_query(files, question, top_k, temp):
        if not files or not question:
            return "", []

        processor = DocumentProcessor()
        documents = processor.process_documents(files)
        answer_text, sources_data = rag.generate_answer(
            question=question,
            documents=documents,
            top_k=top_k,
            temperature=temp
        )

        formatted_sources = []
        for src in sources_data:
            formatted_sources.append([
                src['text'],
                src['source'],
                src['page'],
                f"{src['score']:.2f}"
            ])

        return answer_text, formatted_sources

    question.submit(
        process_query,
        inputs=[file_upload, question, top_k, temperature],
        outputs=[answer, sources]
    )

if __name__ == "__main__":
    demo.launch()
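Answer generation with a large causal LM can easily outlast a synchronous HTTP request, so it may help to enable Gradio's request queue before launching. A minimal sketch, assuming the standard `Blocks.queue()` API of recent Gradio releases; `max_size` is an illustrative value, not part of the original app:

```python
# Hedged sketch: queue long-running generation requests instead of
# handling them synchronously (assumes Gradio's Blocks.queue() API).
if __name__ == "__main__":
    demo.queue(max_size=8).launch()  # max_size chosen for illustration only
```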
markdown.md
ADDED
@@ -0,0 +1,34 @@
---
title: "نظام الذكاء الاصطناعي لتحليل النصوص الدينية"
emoji: "📖"
colorFrom: "blue"
colorTo: "indigo"
sdk: "gradio"
sdk_version: "4.13.0"
app_file: "app.py"
pinned: true
---

# نظام التحليل الديني المدعوم بالذكاء الاصطناعي

## المميزات الرئيسية
- تحليل متقدم للكتب والمقالات الدينية العربية
- إجابات مدعومة بمراجع دقيقة من النصوص
- واجهة مستخدم عربية كاملة (اتجاه من اليمين لليسار)
- دعم كامل لملفات PDF وDOCX العربية
- نماذج مفتوحة المصدر ومجانية بالكامل

## كيفية الاستخدام

### على منصة Hugging Face:
1. انتقل إلى صفحة النموذج
2. اضغط على "تشغيل Space"
3. انتظر اكتمال التحميل (يستغرق حوالي دقيقتين)
4. ارفع ملفاتك وابدأ بطرح الأسئلة

### للتشغيل المحلي:
```bash
git clone https://huggingface.co/spaces/[اسم المستخدم]/[اسم المشروع]
cd [اسم المشروع]
pip install -r requirements.txt
python app.py
```
rag_pipeline.py
ADDED
@@ -0,0 +1,84 @@
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import faiss
import numpy as np
from typing import List, Dict

class ArabicRAGSystem:
    def __init__(self):
        # Initialize models. Note: jais-13b-chat is a very large checkpoint
        # with custom modeling code, hence trust_remote_code below.
        self.embedding_model = SentenceTransformer("aubmindlab/bert-base-arabertv2")
        self.cross_encoder = CrossEncoder("Arabic-Misc/roberta-base-arabic-camelbert-da-msa")
        self.tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b-chat")
        self.llm = AutoModelForCausalLM.from_pretrained(
            "inception-mbzuai/jais-13b-chat", trust_remote_code=True
        )
        self.index = faiss.IndexFlatL2(768)

    def _create_index(self, documents: List[Dict]):
        # Rebuild the index from scratch so repeated queries do not
        # accumulate duplicate vectors.
        self.index.reset()
        texts = [doc["text"] for doc in documents]
        embeddings = self.embedding_model.encode(texts)
        self.index.add(np.array(embeddings, dtype="float32"))

    def generate_answer(self, question: str, documents: List[Dict],
                        top_k: int = 5, temperature: float = 0.7) -> tuple:
        # Indexing phase
        self._create_index(documents)

        # Two-stage retrieval: dense search first, then re-ranking
        query_embedding = self.embedding_model.encode([question])
        candidates = min(top_k * 2, len(documents))
        distances, indices = self.index.search(
            np.array(query_embedding, dtype="float32"), candidates
        )

        # Re-ranking with cross-encoder
        pairs = [[question, documents[idx]["text"]] for idx in indices[0]]
        scores = self.cross_encoder.predict(pairs)
        ranked_positions = np.argsort(scores)[::-1][:top_k]
        top_doc_indices = [indices[0][i] for i in ranked_positions]

        # Prepare context
        context = "\n\n".join([
            f"المصدر: {documents[idx]['source']}\n"
            f"الصفحة: {documents[idx]['page']}\n"
            f"النص: {documents[idx]['text']}"
            for idx in top_doc_indices
        ])

        # Generate answer
        prompt = f"""
أنت خبير في التحليل الديني. قم بالإجابة على السؤال التالي بناءً على السياق المقدم فقط:

السياق:
{context}

السؤال:
{question}

التعليمات:
- أجب باللغة العربية الفصحى
- استخدم علامات التنسيق المناسبة
- أشر إلى المصادر باستخدام التنسيق [المصدر: اسم الملف، الصفحة: رقم]
- إذا لم توجد إجابة واضحة، قل "لا تتوفر معلومات كافية"

الإجابة:
""".strip()

        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.llm.generate(
            inputs.input_ids,
            max_new_tokens=512,
            temperature=temperature,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )

        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = answer.split("الإجابة:")[-1].strip()

        # Prepare sources. Cross-encoder scores are indexed by candidate
        # position, not by document index, so look them up accordingly.
        sources = []
        for pos in ranked_positions:
            doc_idx = indices[0][pos]
            sources.append({
                "text": documents[doc_idx]["text"],
                "source": documents[doc_idx]["source"],
                "page": documents[doc_idx]["page"],
                "score": float(scores[pos])
            })

        return answer, sources
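The retrieval stage above is independent of the generation step, so it can be sanity-checked on a toy corpus without loading the 13B chat model. A minimal sketch, reusing the embedding and re-ranking checkpoints referenced in the class (any compatible SentenceTransformer / CrossEncoder pair would do; the toy documents and question are placeholders):

```python
# Hedged sketch of the two-stage retrieval (dense search + cross-encoder
# re-ranking) on a toy in-memory corpus.
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder

docs = ["النص الأول", "النص الثاني", "النص الثالث"]  # placeholder corpus
question = "سؤال تجريبي"  # placeholder question

embedder = SentenceTransformer("aubmindlab/bert-base-arabertv2")
reranker = CrossEncoder("Arabic-Misc/roberta-base-arabic-camelbert-da-msa")

# Stage 1: dense retrieval with a flat L2 FAISS index
index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
index.add(np.asarray(embedder.encode(docs), dtype="float32"))
_, idx = index.search(np.asarray(embedder.encode([question]), dtype="float32"), 3)

# Stage 2: cross-encoder re-ranking of the retrieved candidates
pairs = [[question, docs[i]] for i in idx[0]]
scores = reranker.predict(pairs)
for rank, pos in enumerate(np.argsort(scores)[::-1], start=1):
    print(rank, docs[idx[0][pos]], float(scores[pos]))
```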
requirements.txt
ADDED
@@ -0,0 +1,12 @@
gradio>=3.50
pymupdf>=1.23.0
python-docx>=0.8.11
sentence-transformers>=2.3.1
faiss-cpu>=1.7.4
transformers>=4.38.0
pyarabic>=0.6.14
langchain>=0.1.0
torch>=2.0.0
safetensors>=0.4.0
arabic-reshaper>=2.1.4
python-bidi>=0.4.2
utils.py
ADDED
@@ -0,0 +1,85 @@
import fitz  # PyMuPDF
from docx import Document
import re
import pyarabic.araby as araby
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict

class DocumentProcessor:
    def __init__(self, chunk_size=512, chunk_overlap=64):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "۔", ".", "؟", "!", "\n"]
        )

    def _normalize_arabic(self, text: str) -> str:
        text = araby.strip_diacritics(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        return re.sub(r'\s+', ' ', text).strip()

    def _process_pdf(self, file_path: str) -> List[Dict]:
        doc = fitz.open(file_path)
        pages = []
        for page_num, page in enumerate(doc):
            text = ""
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line["spans"]:
                            if span["flags"] & 16:  # bold text
                                text += f"**{span['text']}** "
                            else:
                                text += span["text"] + " "
            pages.append({
                "text": self._normalize_arabic(text),
                "source": file_path,
                "page": page_num + 1
            })
        return pages

    def _process_docx(self, file_path: str) -> List[Dict]:
        doc = Document(file_path)
        sections = []
        current_section = {"text": "", "source": file_path, "page": 1}

        for para in doc.paragraphs:
            if para.style.name.startswith('Heading'):
                # A heading closes the current section and starts a new one
                if current_section["text"]:
                    sections.append(current_section)
                current_section = {"text": "", "source": file_path, "page": len(sections) + 1}
                current_section["text"] += f"\n# {para.text}\n"
            else:
                current_section["text"] += para.text + "\n"

        if current_section["text"]:
            sections.append(current_section)
        return [{
            "text": self._normalize_arabic(s["text"]),
            "source": s["source"],
            "page": s["page"]
        } for s in sections]

    def process_documents(self, files: List) -> List[Dict]:
        all_chunks = []
        for file_info in files:
            # Gradio uploads may arrive as plain path strings or as objects
            # exposing a .name attribute; accept both.
            path = file_info if isinstance(file_info, str) else file_info.name
            if path.endswith(".pdf"):
                pages = self._process_pdf(path)
            elif path.endswith(".docx"):
                pages = self._process_docx(path)
            else:
                continue

            for page in pages:
                chunks = self.text_splitter.split_text(page["text"])
                for chunk in chunks:
                    all_chunks.append({
                        "text": chunk,
                        "source": page["source"],
                        "page": page["page"]
                    })
        return all_chunks
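The two modules can also be exercised end to end from a plain Python session, without the Gradio front end. A minimal sketch, assuming the files above are on the import path; `كتاب.pdf` and the question are placeholders, and the models referenced in rag_pipeline.py are large enough that this realistically needs a GPU:

```python
# Hedged end-to-end sketch without the Gradio UI.
# "كتاب.pdf" is a placeholder path for a local Arabic PDF.
from utils import DocumentProcessor
from rag_pipeline import ArabicRAGSystem

processor = DocumentProcessor(chunk_size=512, chunk_overlap=64)
documents = processor.process_documents(["كتاب.pdf"])  # plain paths are accepted

rag = ArabicRAGSystem()
answer, sources = rag.generate_answer(
    question="ما موضوع الفصل الأول؟",  # placeholder question
    documents=documents,
    top_k=5,
    temperature=0.7,
)
print(answer)
for src in sources:
    print(src["source"], src["page"], round(src["score"], 2))
```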