ramysaidagieb commited on
Commit
4254fda
·
verified ·
1 Parent(s): 026659a

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +7 -8
  2. app.py +32 -0
  3. rag_pipeline.py +31 -0
  4. requirements.txt +7 -0
  5. utils.py +17 -0
README.md CHANGED
@@ -1,14 +1,13 @@
1
  ---
2
- title: Chat22GV2
3
- emoji: 🐨
4
- colorFrom: green
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 5.31.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
- short_description: ask questions
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
+ title: Arabic RAG Assistant
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.25.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
+ # Arabic RAG Assistant
13
+ مساعد بحث عربي يعتمد على الذكاء الاصطناعي المفتوح المصدر للإجابة على الأسئلة بناءً على مستندات PDF / DOCX / TXT المرفوعة.
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from rag_pipeline import RAGPipeline
3
+ import time
4
+
5
# Single pipeline instance shared by every Gradio request.
rag = RAGPipeline()


def submit_question(user_question):
    """Answer *user_question* through the RAG pipeline.

    Parameters
    ----------
    user_question : str
        The question typed into the UI.

    Returns
    -------
    tuple[str, str]
        ``(answer, log)`` where *log* contains the elapsed time followed
        by one line per retrieved source passage (Arabic labels).
    """
    # perf_counter is monotonic — immune to wall-clock adjustments,
    # so it is the right tool for measuring a duration.
    start = time.perf_counter()
    response, passages = rag.answer_question(user_question)
    elapsed = time.perf_counter() - start

    # Build the log in one pass; join avoids quadratic += concatenation.
    lines = [f"[⏱️] زمن الإجابة: {elapsed:.2f} ثانية"]
    lines.extend(
        f"[📘] المرجع {i+1}: {passage}" for i, passage in enumerate(passages)
    )
    return response, "\n".join(lines) + "\n"
15
+
16
# Two-column Gradio UI: left column uploads and indexes documents,
# right column asks questions against the built index.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Arabic labels: "upload documents", "upload & process",
            # "processing log". Only PDF/DOCX/TXT files are accepted.
            file_uploader = gr.File(file_types=[".pdf", ".docx", ".txt"], label="📂 رفع المستندات", file_count="multiple")
            upload_btn = gr.Button("⬆️ رفع ومعالجة")
            upload_log = gr.Textbox(label="🔍 سجل المعالجة", lines=10)

        with gr.Column():
            # Arabic labels: "ask your question here", "send question",
            # "answer", "references".
            question = gr.Textbox(label="❓ اطرح سؤالك هنا")
            submit_btn = gr.Button("🔎 إرسال السؤال")
            answer = gr.Textbox(label="📜 الإجابة", lines=5)
            sources = gr.Textbox(label="🧭 المراجع", lines=10)

    # Wire the buttons: indexing goes straight to the pipeline method;
    # questions go through submit_question, which adds timing/source logs.
    upload_btn.click(fn=rag.load_and_index, inputs=[file_uploader], outputs=[upload_log])
    submit_btn.click(fn=submit_question, inputs=[question], outputs=[answer, sources])

demo.launch()
rag_pipeline.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from utils import extract_text_from_files
7
+
8
class RAGPipeline:
    """Retrieval-augmented generation over user-uploaded documents.

    Embeds document chunks with a multilingual sentence-embedding model,
    stores them in an in-memory Chroma index, and answers questions by
    prompting a causal LM with the top retrieved passages.
    """

    def __init__(self):
        # Arabic status message: "loading the model and tokenizer...".
        print("[RAG] جاري تحميل النموذج والمحول...")
        # Multilingual embedding model (covers Arabic queries/passages).
        self.embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
        # NOTE(review): trust_remote_code=True executes code from the model
        # repository — security-sensitive; confirm the source is trusted.
        # device_map="auto" lets accelerate place the 7B model on available
        # hardware — presumably a GPU is required in practice; verify.
        self.generator = pipeline("text-generation", model="tiiuae/falcon-7b-instruct", trust_remote_code=True, device_map="auto")
        # Vector store; built lazily by load_and_index().
        self.db = None
        # Arabic status message: "loaded successfully".
        print("[RAG] تم التحميل بنجاح.")

    def load_and_index(self, files):
        """Extract text from *files*, chunk it, and (re)build the index.

        Returns an Arabic status string reporting the chunk count.
        """
        text = extract_text_from_files(files)
        # 500-char chunks with 50-char overlap keep passages small enough
        # to retrieve while preserving context across boundaries.
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunks = splitter.split_text(text)
        # NOTE(review): every upload builds a fresh index, silently
        # discarding previously indexed documents — confirm intended.
        self.db = Chroma.from_texts(chunks, embedding=self.embedding_model)
        return f"[RAG] تم بناء الفهرس لـ {len(chunks)} مقاطع."

    def answer_question(self, question):
        """Answer *question* from the top-3 retrieved chunks.

        Returns ``(answer, passages)``; when no index has been built yet,
        returns an Arabic warning string and an empty passage list.
        """
        if self.db is None:
            return "⚠️ لم يتم تحميل مستندات.", []
        docs = self.db.similarity_search(question, k=3)
        context = "\n".join([doc.page_content for doc in docs])
        # Arabic prompt: "answer the following question based only on the
        # following references: ... Question: ... Answer:".
        prompt = f"أجب عن السؤال التالي بناءً على المراجع التالية فقط:\n{context}\n\nالسؤال: {question}\nالإجابة:"
        result = self.generator(prompt, max_new_tokens=200)[0]["generated_text"]
        # The pipeline echoes the prompt; keep only the text after the
        # last "الإجابة:" ("Answer:") marker.
        answer = result.split("الإجابة:")[-1].strip()
        return answer, [doc.page_content for doc in docs]
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==4.25.0
2
+ transformers==4.40.1
3
+ sentence-transformers
4
+ langchain==0.1.20
5
+ chromadb==0.4.24
6
+ PyPDF2
7
+ python-docx
utils.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ import docx
3
+
4
def extract_text_from_files(files):
    """Concatenate the plain text of the given PDF/DOCX/TXT uploads.

    Parameters
    ----------
    files : iterable
        Items may be file-like objects (with ``.name`` and ``.read()``)
        or plain filesystem path strings — Gradio's File component hands
        back either depending on version/configuration.

    Returns
    -------
    str
        Extracted text with one trailing newline per page, paragraph, or
        text file. Files with unrecognized extensions are skipped.
    """
    parts = []
    for file in files:
        # Bug fix: Gradio may pass bare path strings, which have no
        # .name attribute — fall back to the item itself.
        name = getattr(file, "name", file)
        if name.endswith(".pdf"):
            reader = PdfReader(file)
            for page in reader.pages:
                # Bug fix: extract_text() returns None for image-only
                # pages; the original crashed on None + "\n".
                parts.append((page.extract_text() or "") + "\n")
        elif name.endswith(".docx"):
            doc = docx.Document(file)
            for para in doc.paragraphs:
                parts.append(para.text + "\n")
        elif name.endswith(".txt"):
            # Bug fix: the original unconditionally called
            # file.read().decode(), which fails for path strings and
            # for text-mode file objects that already return str.
            if hasattr(file, "read"):
                data = file.read()
                if isinstance(data, bytes):
                    data = data.decode("utf-8")
            else:
                with open(name, encoding="utf-8") as fh:
                    data = fh.read()
            parts.append(data + "\n")
    # join avoids quadratic += string building across many files/pages.
    return "".join(parts)