Upload 5 files
- README.md +7 -8
- app.py +32 -0
- rag_pipeline.py +31 -0
- requirements.txt +7 -0
- utils.py +17 -0
README.md
CHANGED
@@ -1,14 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Arabic RAG Assistant
+emoji: 📚
+colorFrom: blue
+colorTo: purple
 sdk: gradio
-sdk_version:
+sdk_version: 4.25.0
 app_file: app.py
 pinned: false
-license: mit
-short_description: ask questions
 ---
 
-
+# Arabic RAG Assistant
+An Arabic research assistant built on open-source AI that answers questions based on uploaded PDF / DOCX / TXT documents.
app.py
ADDED
@@ -0,0 +1,32 @@
+import gradio as gr
+from rag_pipeline import RAGPipeline
+import time
+
+rag = RAGPipeline()
+
+def submit_question(user_question):
+    start_time = time.time()
+    response, passages = rag.answer_question(user_question)
+    end_time = time.time()
+    log = f"[⏱️] زمن الإجابة: {end_time - start_time:.2f} ثانية\n"  # "Answer time: <t> seconds"
+    for i, passage in enumerate(passages):
+        log += f"[📘] المرجع {i+1}: {passage}\n"  # "Reference <i+1>: <passage>"
+    return response, log
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            file_uploader = gr.File(file_types=[".pdf", ".docx", ".txt"], label="📂 رفع المستندات", file_count="multiple")  # "Upload documents"
+            upload_btn = gr.Button("⬆️ رفع ومعالجة")  # "Upload and process"
+            upload_log = gr.Textbox(label="🔍 سجل المعالجة", lines=10)  # "Processing log"
+
+        with gr.Column():
+            question = gr.Textbox(label="❓ اطرح سؤالك هنا")  # "Ask your question here"
+            submit_btn = gr.Button("🔎 إرسال السؤال")  # "Submit question"
+            answer = gr.Textbox(label="📜 الإجابة", lines=5)  # "Answer"
+            sources = gr.Textbox(label="🧭 المراجع", lines=10)  # "References"
+
+    upload_btn.click(fn=rag.load_and_index, inputs=[file_uploader], outputs=[upload_log])
+    submit_btn.click(fn=submit_question, inputs=[question], outputs=[answer, sources])
+
+demo.launch()
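Since loading the 7B generator makes the UI slow to iterate on, a console-level check of the same pipeline can be handy. This is a minimal sketch, not part of the committed Space: it assumes a local notes.txt exists and that there is enough GPU memory for the generator; SimpleNamespace stands in for Gradio's uploaded-file handle, whose temp path the code reads from .name.

from types import SimpleNamespace
from rag_pipeline import RAGPipeline

rag = RAGPipeline()
files = [SimpleNamespace(name="notes.txt")]        # hypothetical local file
print(rag.load_and_index(files))                   # builds the Chroma index
answer, passages = rag.answer_question("ما الفكرة الرئيسية؟")  # "What is the main idea?"
print(answer)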
rag_pipeline.py
ADDED
@@ -0,0 +1,31 @@
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from utils import extract_text_from_files
+
+class RAGPipeline:
+    def __init__(self):
+        print("[RAG] جاري تحميل النموذج والمحول...")  # "Loading the model and embedder..."
+        self.embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
+        self.generator = pipeline("text-generation", model="tiiuae/falcon-7b-instruct", trust_remote_code=True, device_map="auto")
+        self.db = None
+        print("[RAG] تم التحميل بنجاح.")  # "Loaded successfully."
+
+    def load_and_index(self, files):
+        text = extract_text_from_files(files)
+        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+        chunks = splitter.split_text(text)
+        self.db = Chroma.from_texts(chunks, embedding=self.embedding_model)
+        return f"[RAG] تم بناء الفهرس لـ {len(chunks)} مقاطع."  # "Index built over <n> chunks."
+
+    def answer_question(self, question):
+        if self.db is None:
+            return "⚠️ لم يتم تحميل مستندات.", []  # "No documents have been loaded."
+        docs = self.db.similarity_search(question, k=3)
+        context = "\n".join([doc.page_content for doc in docs])
+        prompt = f"أجب عن السؤال التالي بناءً على المراجع التالية فقط:\n{context}\n\nالسؤال: {question}\nالإجابة:"  # "Answer the following question using only these passages: <context> Question: <q> Answer:"
+        result = self.generator(prompt, max_new_tokens=200)[0]["generated_text"]
+        answer = result.split("الإجابة:")[-1].strip()  # keep only the text after the "Answer:" marker
+        return answer, [doc.page_content for doc in docs]
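Retrieval quality can be checked without loading Falcon-7B at all. A retrieval-only sketch against the same langchain 0.1.x APIs used above; the document text here is a placeholder repeated to force multiple chunks:

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

emb = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text = "هذا نص تجريبي للفهرسة. " * 200  # placeholder text ("This is a test text for indexing.")
chunks = splitter.split_text(text)
db = Chroma.from_texts(chunks, embedding=emb)
for doc in db.similarity_search("السؤال هنا", k=3):  # "the question here"
    print(doc.page_content[:100])

One caveat worth flagging: the E5 model family is trained with "query: " and "passage: " prefixes, so prepending those to questions and chunks may improve retrieval; the committed pipeline indexes raw text.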
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+gradio==4.25.0
+transformers==4.40.1
+sentence-transformers
+langchain==0.1.20
+chromadb==0.4.24
+PyPDF2
+python-docx
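These pins cover the imports above: gradio for the UI, transformers for generation, sentence-transformers behind HuggingFaceEmbeddings, langchain and chromadb for splitting and indexing, PyPDF2 and python-docx for extraction. Note that device_map="auto" in rag_pipeline.py typically also requires the accelerate package, which is not pinned here. For a local run, presumably:

pip install -r requirements.txt
python app.py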
utils.py
ADDED
@@ -0,0 +1,17 @@
+from PyPDF2 import PdfReader
+import docx
+
+def extract_text_from_files(files):
+    all_text = ""
+    for file in files:
+        if file.name.endswith(".pdf"):  # file.name is the temp path Gradio exposes for uploads
+            reader = PdfReader(file.name)
+            for page in reader.pages:
+                all_text += (page.extract_text() or "") + "\n"  # extract_text() can return None for image-only pages
+        elif file.name.endswith(".docx"):
+            doc = docx.Document(file.name)
+            for para in doc.paragraphs:
+                all_text += para.text + "\n"
+        elif file.name.endswith(".txt"):
+            all_text += open(file.name, encoding="utf-8").read() + "\n"
+    return all_text
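A quick check of the extractor outside Gradio, under the same assumption as the earlier sketch: SimpleNamespace mimics the .name attribute the code reads from uploaded-file handles, and the paths are hypothetical.

from types import SimpleNamespace
from utils import extract_text_from_files

files = [SimpleNamespace(name="report.pdf"), SimpleNamespace(name="notes.txt")]  # hypothetical local files
text = extract_text_from_files(files)
print(f"{len(text)} characters extracted")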