ramysaidagieb committed on
Commit
c0b2a4d
·
verified ·
1 Parent(s): 23fb7a8

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +19 -14
  2. app.py +88 -0
  3. requirements.txt +6 -0
  4. trainset.jsonl +2 -0
  5. valset.jsonl +2 -0
README.md CHANGED
@@ -1,14 +1,19 @@
1
- ---
2
- title: Ask1
3
- emoji: 📉
4
- colorFrom: blue
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.33.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: ask to answer
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
1
+ # 🧠 Arabic RAG System with DSPy + Gradio
2
+
3
+ This is a full Hugging Face Space project that allows:
4
+ - Uploading Arabic PDF documents.
5
+ - Storing and indexing chunks using ChromaDB.
6
+ - Asking questions and generating answers using DSPy with context retrieval.
7
+ - Improving answer accuracy using MIPROv2 optimization based on train/val sets.
8
+
9
+ ## 🚀 Usage
10
+
11
+ 1. Upload one or more Arabic PDFs.
12
+ 2. Ask a question in Arabic.
13
+ 3. Optionally upload `trainset.jsonl` and `valset.jsonl` to optimize the DSPy RAG module's prompts with MIPROv2 (no model weights are fine-tuned).
14
+
15
+ ## 📁 Files
16
+
17
+ - `app.py` — Main code
18
+ - `requirements.txt` — Dependencies
19
+ - `trainset.jsonl` / `valset.jsonl` — Example training and validation sets
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dspy, gradio as gr
2
+ import chromadb
3
+ from chromadb.config import Settings
4
+ import fitz # PyMuPDF
5
+ from sentence_transformers import SentenceTransformer
6
+ import json
7
+ from dspy import Example, MIPROv2, Evaluate, evaluate
8
+
9
# Configure the LLM.
# `dspy.LM` is the LiteLLM-backed client that replaced the legacy
# `dspy.OpenAI` class; provider credentials are expected in the environment
# (e.g. OPENAI_API_KEY).
dspy.settings.configure(lm=dspy.LM("openai/gpt-4"))

# Configure the vector database.
# `PersistentClient` replaces the deprecated
# `Settings(chroma_db_impl="duckdb+parquet", persist_directory=...)`
# configuration, which current Chroma releases reject at start-up.
client = chromadb.PersistentClient(path="./chroma_db")
col = client.get_or_create_collection(name="arabic_docs", metadata={"hnsw:space": "cosine"})

# Sentence encoder used both for indexing chunks and for embedding queries.
embedder = SentenceTransformer("sentence-transformers/LaBSE")
17
+
18
# Import a PDF and split it into chunks
def process_pdf(pdf_bytes):
    """Extract paragraph-sized text chunks from a PDF.

    Args:
        pdf_bytes: The PDF to read. May be raw content (``bytes`` or
            ``bytearray``), a filesystem path (``str`` / ``os.PathLike``),
            or an object exposing the path as ``.name``. Gradio's ``File``
            component passes a path rather than bytes unless it is created
            with ``type="binary"``. ``None`` (no file selected) yields an
            empty list.

    Returns:
        A list of whitespace-stripped chunks, in page order, keeping only
        chunks longer than 50 characters after stripping.
    """
    import os

    if pdf_bytes is None:
        return []

    if isinstance(pdf_bytes, (bytes, bytearray)):
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    else:
        # Check str/PathLike first: `Path.name` is only the basename, so the
        # `.name` fallback is reserved for file-like upload objects.
        if isinstance(pdf_bytes, (str, os.PathLike)):
            source = pdf_bytes
        else:
            source = pdf_bytes.name
        doc = fitz.open(os.fspath(source))

    chunks = []
    # The context manager closes the document even if extraction fails.
    with doc:
        for page in doc:
            for raw in page.get_text().split("\n\n"):
                piece = raw.strip()
                # Measure after stripping so whitespace padding cannot push a
                # short fragment over the threshold.
                if len(piece) > 50:
                    chunks.append(piece)
    return chunks
28
+
29
def ingest(pdf_bytes):
    """Chunk a PDF, embed the chunks and store them in the Chroma collection.

    Args:
        pdf_bytes: The uploaded PDF, forwarded unchanged to ``process_pdf``.

    Returns:
        A status message (Arabic) reporting how many chunks were added.
    """
    import uuid

    texts = process_pdf(pdf_bytes)
    if not texts:
        # Nothing to embed or store; skip the encoder and the database call.
        return f"تمت إضافة {len(texts)} مقطعاً"

    embeddings = embedder.encode(texts, show_progress_bar=True)

    # IDs must be unique across uploads: a bare "chunk_{i}" restarts at 0 for
    # every PDF and collides with chunks from earlier documents.
    upload_id = uuid.uuid4().hex
    batch_size = 500  # keep each request well below Chroma's max batch size
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        col.add(
            ids=[f"chunk_{upload_id}_{i}" for i in range(start, min(stop, len(texts)))],
            embeddings=[emb.tolist() for emb in embeddings[start:stop]],
            metadatas=[{"text": chunk} for chunk in texts[start:stop]],
        )
    return f"تمت إضافة {len(texts)} مقطعاً"
35
+
36
def retriever(question, k=3):
    """Return the texts of the ``k`` stored chunks most similar to *question*.

    The chunks were indexed with embeddings from ``embedder``, so the query
    is embedded with the same model and passed as ``query_embeddings``.

    Args:
        question: The user's question as plain text.
        k: Maximum number of passages to return.

    Returns:
        A list of passage strings, best match first; empty when the
        collection holds no chunks.
    """
    stored = col.count()
    if stored == 0:
        return []
    query_embedding = embedder.encode([question])[0].tolist()
    result = col.query(query_embeddings=[query_embedding], n_results=min(k, stored))
    # Chroma returns one list of metadata dicts per query; only one query
    # is sent, so take the first (and only) inner list.
    hits = (result.get("metadatas") or [[]])[0]
    return [meta["text"] for meta in hits if meta and "text" in meta]
37
+
38
class RagSig(dspy.Signature):
    """Answer the question using the provided context."""

    # NOTE: DSPy uses the class docstring above as the task instruction in the
    # prompt, and each `desc` as the field description.
    # Every field must be declared through InputField/OutputField; bare
    # annotations make DSPy raise TypeError when the class is created.
    question: str = dspy.InputField(desc="the user's question")
    context: str = dspy.InputField(desc="passage retrieved from the uploaded documents")
    answer: str = dspy.OutputField(desc="answer to the question")
42
+
43
class RagMod(dspy.Module):
    """Retrieve-then-answer module: fetch a context passage, then predict."""

    def __init__(self):
        super().__init__()
        self.predictor = dspy.Predict(RagSig)

    def forward(self, question):
        """Answer *question* using the top retrieved passage as context.

        Args:
            question: The user's question as plain text.

        Returns:
            The prediction returned by ``self.predictor``; its ``answer``
            field holds the generated answer.
        """
        retrieved = retriever(question)
        # Accept either a plain list of passages or a result object that
        # exposes them as `.passages`.
        passages = getattr(retrieved, "passages", retrieved)
        # Fall back to an empty context instead of raising IndexError when
        # nothing has been indexed yet.
        context = passages[0] if passages else ""
        return self.predictor(question=question, context=context)


# Module-level instance used by `answer`; `optimize` rebinds it.
model = RagMod()
53
+
54
def answer(question):
    """Run the current module-level ``model`` on *question* and return its answer text."""
    return model(question).answer
57
+
58
def load_dataset(path):
    """Load a JSONL file into a list of DSPy examples.

    Each non-blank line must be a JSON object; its keys become the example's
    fields, and ``question`` is marked as the input field.

    Args:
        path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of ``Example`` objects, one per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            record = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; skipping them avoids a JSONDecodeError.
            if not record:
                continue
            examples.append(Example(**json.loads(record)).with_inputs("question"))
    return examples
61
+
62
def optimize(train_file, val_file):
    """Optimize the module-level RAG ``model`` with MIPROv2 and replace it.

    Args:
        train_file: Uploaded training set; its path is read from ``.name``.
        val_file: Uploaded validation set; its path is read from ``.name``.

    Returns:
        A status message (Arabic) once the optimized module is installed.
    """
    # `global` must come before the first use of the name in this function;
    # declaring it after `model` has been read is a SyntaxError.
    global model

    trainset = load_dataset(train_file.name)
    valset = load_dataset(val_file.name)
    tp = MIPROv2(metric=evaluate.answer_exact_match, auto="light", num_threads=4)
    # This runs inside a web callback with no terminal attached, so the
    # optimizer must not wait for interactive confirmation on stdin.
    model = tp.compile(
        model,
        trainset=trainset,
        valset=valset,
        requires_permission_to_run=False,
    )
    return "✅ تم تحسين النموذج!"
70
+
71
# Gradio UI: one tab each for ingestion, question answering and optimization.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy")

    # Tab 1: upload a PDF and index its chunks.
    with gr.Tab("📥 تحميل وتخزين"):
        pdf_input = gr.File(label="ارفع ملف PDF")
        ingest_btn = gr.Button("إضافة إلى قاعدة البيانات")
        ingest_status = gr.Textbox()
        ingest_btn.click(ingest, inputs=pdf_input, outputs=ingest_status)

    # Tab 2: ask a question against the indexed chunks.
    with gr.Tab("❓ سؤال"):
        q = gr.Textbox(label="اكتب سؤالك")
        answer_btn = gr.Button("احصل على الإجابة")
        out = gr.Textbox(label="الإجابة")
        answer_btn.click(answer, inputs=q, outputs=out)

    # Tab 3: upload train/validation sets and run the optimizer.
    with gr.Tab("⚙️ تحسين النموذج"):
        train_file = gr.File(label="trainset.jsonl")
        val_file = gr.File(label="valset.jsonl")
        opt_btn = gr.Button("ابدأ التحسين")
        result = gr.Textbox(label="نتيجة التحسين")
        opt_btn.click(optimize, inputs=[train_file, val_file], outputs=result)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ dspy-ai
2
+ chromadb
3
+ sentence-transformers
4
+ PyMuPDF
5
+ gradio
6
+ transformers
trainset.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"question": "ما هو DSPy؟", "answer": "DSPy هو إطار مفتوح المصدر من جامعة ستانفورد لتصميم برامج LLMs."}
2
+ {"question": "كيف يعمل نظام RAG؟", "answer": "نظام RAG يعمل من خلال استرجاع المعلومات من قاعدة معرفة ثم توليد إجابة باستخدام نموذج لغة كبير."}
valset.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"question": "ما فائدة Chroma في RAG؟", "answer": "Chroma تُستخدم لتخزين واسترجاع المقاطع النصية ذات الصلة بالسؤال."}
2
+ {"question": "ما هي وظيفة MIPROv2؟", "answer": "MIPROv2 هو محسن يُستخدم لتحسين دقة نموذج DSPy باستخدام بيانات تدريبية."}