File size: 4,090 Bytes
ebae726
 
c0b2a4d
 
 
d3d92c6
ebae726
c0b2a4d
d3d92c6
da06f67
c0b2a4d
d3d92c6
e3b4042
 
c0b2a4d
d3d92c6
c0b2a4d
 
d3d92c6
ebae726
 
c0b2a4d
183e91e
 
c0b2a4d
da06f67
c0b2a4d
 
 
6ec351d
6a78ac0
6ec351d
 
 
 
d3d92c6
c0b2a4d
 
d3d92c6
da06f67
c0b2a4d
ebae726
 
 
 
d3d92c6
ebae726
 
d3d92c6
c0b2a4d
1c9be4e
 
 
c0b2a4d
d3d92c6
c0b2a4d
 
 
 
 
 
ebae726
c0b2a4d
 
 
 
d3d92c6
c0b2a4d
 
 
 
d3d92c6
c0b2a4d
 
 
 
ebae726
c0b2a4d
309ee8b
c0b2a4d
 
 
 
 
 
 
f30a153
c0b2a4d
e3b4042
 
c0b2a4d
d3d92c6
c0b2a4d
d3d92c6
 
e3b4042
c0b2a4d
da06f67
c0b2a4d
 
d3d92c6
e3b4042
c0b2a4d
d3d92c6
 
c0b2a4d
 
 
da06f67
c0b2a4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import hashlib
import json

import chromadb
import dspy
import fitz  # PyMuPDF
import gradio as gr
from dspy import Example, MIPROv2, evaluate
from sentence_transformers import SentenceTransformer

# Configure the open-source language model used by every DSPy call below.
# NOTE(review): dspy.LM is understood to resolve model names through LiteLLM,
# which usually expects a provider prefix -- confirm this id resolves as-is.
dspy.settings.configure(lm=dspy.LM("mistralai/Mistral-7B-Instruct-v0.2"))

# Set up Chroma: a client persisted under ./chroma_db and the single
# collection that `ingest` writes to and `retrieve_context` queries.
client = chromadb.PersistentClient(path="./chroma_db")
col = client.get_or_create_collection(name="arabic_docs")

# Set up the embedding model; the same encoder is used for both the stored
# chunks and the incoming queries.
embedder = SentenceTransformer("sentence-transformers/LaBSE")

# Split PDF text into chunks
def process_pdf(pdf_bytes, min_chars=50):
    """Extract paragraph-like text chunks from an in-memory PDF.

    Each page's text is split on blank lines ("\\n\\n"); every piece is
    stripped, and only pieces longer than ``min_chars`` characters are kept.

    Args:
        pdf_bytes: Raw PDF content, passed to PyMuPDF as a stream.
        min_chars: A chunk is kept only if its stripped length is strictly
            greater than this value. Defaults to 50.

    Returns:
        A list of stripped text chunks in page order.
    """
    # Open the document as a context manager so it is closed even if text
    # extraction raises; the page text is copied out before leaving the block.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_texts = [page.get_text() for page in doc]

    chunks = []
    for page_text in page_texts:
        for piece in page_text.split("\n\n"):
            piece = piece.strip()
            if len(piece) > min_chars:
                chunks.append(piece)
    return chunks

# Insert data into Chroma, with NamedString (file path) support
# Upper bound on how many chunks are sent to Chroma in a single call.
_INGEST_BATCH_SIZE = 500


def _read_pdf_bytes(pdf_file):
    """Return the raw bytes behind an uploaded PDF value."""
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if hasattr(pdf_file, "read"):
        return pdf_file.read()
    # Anything else is treated as a filesystem path. This covers plain str
    # paths and str subclasses such as NamedString, which must be opened and
    # read rather than handed to the PDF parser as if they were content.
    with open(pdf_file, "rb") as fh:
        return fh.read()


def ingest(pdf_file):
    """Chunk an uploaded PDF, embed the chunks and store them in Chroma.

    Args:
        pdf_file: Raw bytes, a file-like object with ``read()``, or a path
            to the PDF on disk. ``None`` means nothing was uploaded.

    Returns:
        A user-facing status message.
    """
    if pdf_file is None:
        return "⚠️ يرجى رفع ملف PDF أولاً."

    pdf_bytes = _read_pdf_bytes(pdf_file)
    texts = process_pdf(pdf_bytes)
    if not texts:
        # Nothing to embed or store; report zero chunks as before.
        return f"✅ تمت إضافة {len(texts)} مقطعاً."

    embeddings = embedder.encode(texts, show_progress_bar=True)

    # Prefix ids with a digest of the file content so chunks from different
    # PDFs never collide, while re-uploading the same file maps to the same
    # ids (and is therefore overwritten by upsert instead of duplicated).
    doc_id = hashlib.sha256(pdf_bytes).hexdigest()[:16]
    total = len(texts)
    for start in range(0, total, _INGEST_BATCH_SIZE):
        stop = min(start + _INGEST_BATCH_SIZE, total)
        col.upsert(
            ids=[f"{doc_id}_chunk_{i}" for i in range(start, stop)],
            embeddings=[emb.tolist() for emb in embeddings[start:stop]],
            # The chunk text lives in metadata because retrieve_context reads
            # it back from the "text" metadata key.
            metadatas=[{"text": chunk} for chunk in texts[start:stop]],
        )
    return f"✅ تمت إضافة {total} مقطعاً."

# Retrieve context from Chroma
def retrieve_context(query, n_results=1):
    """Return the stored text most similar to ``query``.

    Args:
        query: The question text to embed and search with.
        n_results: How many nearest chunks to request. Defaults to 1, which
            yields a single chunk.

    Returns:
        The retrieved chunk texts joined by blank lines, or an empty string
        when nothing usable is returned.
    """
    query_emb = embedder.encode([query])[0]
    results = col.query(
        query_embeddings=[query_emb.tolist()],
        n_results=n_results,
    )
    # Skip missing groups/entries and entries without a "text" key so one
    # malformed record cannot break retrieval.
    passages = [
        meta["text"]
        for group in (results.get("metadatas") or [])
        for meta in (group or [])
        if meta and "text" in meta
    ]
    return "\n\n".join(passages)

# DSPy signature definition
class RagSig(dspy.Signature):
    # Declares the predictor's interface: two inputs (question, context) and
    # one output (answer). RagMod.forward fills `context` from the retriever.
    # NOTE(review): DSPy is understood to use a Signature's docstring as the
    # task instructions sent to the LM, so this class is documented with
    # comments only -- confirm before adding a docstring, as it may change
    # the prompt.
    question: str = dspy.InputField()
    context: str = dspy.InputField()
    answer: str = dspy.OutputField()

# DSPy module
class RagMod(dspy.Module):
    """Retrieval-augmented module: fetch context, then predict an answer."""

    def __init__(self):
        """Create the module with a single `RagSig` predictor."""
        super().__init__()
        self.predictor = dspy.Predict(RagSig)

    def forward(self, question):
        """Look up context for ``question`` and return the prediction."""
        return self.predictor(
            question=question,
            context=retrieve_context(question),
        )

# Module-level RAG program: `answer` calls it, and `optimize` rebinds it to
# the compiled program.
model = RagMod()

# Generate an answer
def answer(question):
    """Run the current module-level `model` on ``question`` and return its answer."""
    return model(question).answer

# Load training/evaluation data
def load_dataset(path):
    """Load a JSON Lines file into a list of DSPy examples.

    Each non-blank line must be a JSON object; its keys become the example's
    fields, and "question" is marked as the input field.

    Args:
        path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of examples, one per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a stray empty line at the end
        # of the file) are skipped instead of being fed to json.loads.
        return [
            Example(**json.loads(line)).with_inputs("question")
            for line in f
            if line.strip()
        ]

# Optimize the model
def _uploaded_path(upload):
    """Return the filesystem path behind an uploaded-file value."""
    # Objects exposing `.name` (temp-file wrappers, NamedString) carry the
    # path there; a plain path string is used as-is.
    return getattr(upload, "name", upload)


def optimize(train_file, val_file):
    """Compile the module-level `model` with MIPROv2 and swap it in.

    Args:
        train_file: Uploaded training set (JSON Lines), or ``None`` if
            nothing was uploaded.
        val_file: Uploaded validation set (JSON Lines), or ``None``.

    Returns:
        A user-facing status message.
    """
    global model
    # The button can be clicked before both files are uploaded; report that
    # instead of failing on a missing upload.
    if train_file is None or val_file is None:
        return "⚠️ يرجى رفع ملفي التدريب والتحقق قبل بدء التحسين."

    trainset = load_dataset(_uploaded_path(train_file))
    valset = load_dataset(_uploaded_path(val_file))
    if not trainset or not valset:
        return "⚠️ ملفات البيانات فارغة."

    tp = MIPROv2(metric=evaluate.answer_exact_match, auto="light", num_threads=4)
    # Rebind the global only after compile succeeds, so a failed run leaves
    # the currently served model in place.
    model = tp.compile(model, trainset=trainset, valset=valset)
    return "✅ تم تحسين النموذج!"

# Gradio interface: three tabs wired to `ingest`, `answer` and `optimize`.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + نموذج مفتوح المصدر")

    # Tab 1: upload a PDF and store its chunks in the vector database.
    with gr.Tab("📥 تحميل وتخزين"):
        pdf_input = gr.File(label="ارفع ملف PDF", file_types=[".pdf"])
        ingest_btn = gr.Button("إضافة إلى قاعدة البيانات")
        ingest_output = gr.Textbox()
        ingest_btn.click(fn=ingest, inputs=pdf_input, outputs=ingest_output)

    # Tab 2: ask a question and show the generated answer.
    with gr.Tab("❓ سؤال"):
        q = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer_btn = gr.Button("احصل على الإجابة")
        out = gr.Textbox(label="الإجابة")
        answer_btn.click(fn=answer, inputs=q, outputs=out)

    # Tab 3: upload train/validation sets and run the optimizer.
    with gr.Tab("⚙️ تحسين النموذج"):
        train_file = gr.File(label="trainset.jsonl", file_types=[".jsonl"])
        val_file = gr.File(label="valset.jsonl", file_types=[".jsonl"])
        opt_btn = gr.Button("ابدأ التحسين")
        result = gr.Textbox(label="نتيجة التحسين")
        opt_btn.click(optimize, inputs=[train_file, val_file], outputs=result)

# Launch only after the Blocks context has closed, so the layout is fully
# built before the server starts. Kept unconditional at module level so that
# running or importing this file starts the app exactly as before.
demo.launch()