import os
import json

import dspy
import gradio as gr
import chromadb
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from dspy import Example
from dspy.teleprompt import MIPROv2
from dspy.evaluate import answer_exact_match
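# Assumed runtime dependencies (package names only, versions not pinned):
#   pip install dspy-ai gradio chromadb pymupdf sentence-transformers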

# Configure the language model: dspy.LM routes through LiteLLM, and the
# "huggingface/" prefix selects the Hugging Face Inference API provider.
HF_TOKEN = os.environ.get("HF_TOKEN")
dspy.settings.configure(
    lm=dspy.LM(
        model="huggingface/HuggingFaceH4/zephyr-7b-beta",
        api_base="https://api-inference.huggingface.co/v1",
        api_key=HF_TOKEN,
    )
)

# Set up the persistent Chroma database
client = chromadb.PersistentClient(path="./chroma_db")
col = client.get_or_create_collection(name="arabic_docs")

# Embedding model with Arabic support (LaBSE)
embedder = SentenceTransformer("sentence-transformers/LaBSE")

# Split PDF text into chunks
def process_pdf(pdf_bytes):
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    texts = []
    for page in doc:
        text = page.get_text()
        for chunk in text.split("\n\n"):
            if len(chunk.strip()) > 50:
                texts.append(chunk.strip())
    return texts

# Ingest data into Chroma
def ingest(pdf_file):
    pdf_bytes = pdf_file
    texts = process_pdf(pdf_bytes)
    embeddings = embedder.encode(texts, show_progress_bar=True)
    offset = col.count()  # avoid ID collisions when more than one PDF is ingested
    for i, (chunk, emb) in enumerate(zip(texts, embeddings)):
        col.add(ids=[f"chunk_{offset + i}"], embeddings=[emb.tolist()], metadatas=[{"text": chunk}])
    return f"✅ تمت إضافة {len(texts)} مقطعاً."

# Retrieve context for a question
def retrieve_context(question):
    embedding = embedder.encode([question])[0]
    results = col.query(query_embeddings=[embedding.tolist()], n_results=3)
    context_list = [m["text"] for m in results["metadatas"][0]]
    return "\n\n".join(context_list)

# Signature for the RAG module
class RagSig(dspy.Signature):
    question: str = dspy.InputField()
    context: str = dspy.InputField()
    answer: str = dspy.OutputField()

# RAG module
class RagMod(dspy.Module):
    def __init__(self):
        super().__init__()
        self.predictor = dspy.Predict(RagSig)

    def forward(self, question):
        context = retrieve_context(question)
        return self.predictor(question=question, context=context)

model = RagMod()

# Generate an answer
def answer(question):
    out = model(question)
    return out.answer

# Load training and validation data (JSONL, one example per line)
def load_dataset(path):
    with open(path, "r", encoding="utf-8") as f:
        return [Example(**json.loads(l)).with_inputs("question") for l in f]
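
# A hypothetical example of one line in trainset.jsonl / valset.jsonl (assumed
# schema: each line is a JSON object with "question" and "answer" keys, which is
# what RagSig and the answer_exact_match metric expect):
# {"question": "ما هي عاصمة مصر؟", "answer": "القاهرة"}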

# Optimize the model's prompts with MIPROv2
def optimize(train_path, val_path):
    global model
    trainset = load_dataset(train_path)
    valset = load_dataset(val_path)
    tp = MIPROv2(metric=answer_exact_match, auto="light", num_threads=4)
    optimized = tp.compile(model, trainset=trainset, valset=valset)
    model = optimized
    return "✅ تم تحسين النموذج!"

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + ChromaDB + Hugging Face")

    with gr.Tab("📥 تحميل وتخزين"):
        pdf_input = gr.File(label="ارفع ملف PDF", type="binary")
        ingest_btn = gr.Button("إضافة إلى قاعدة البيانات")
        ingest_out = gr.Textbox(label="نتيجة الإضافة")
        ingest_btn.click(ingest, inputs=pdf_input, outputs=ingest_out)

    with gr.Tab("❓ سؤال"):
        q = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer_btn = gr.Button("احصل على الإجابة")
        out = gr.Textbox(label="الإجابة")
        answer_btn.click(answer, inputs=q, outputs=out)

    with gr.Tab("⚙️ تحسين النموذج"):
        train_file = gr.File(label="trainset.jsonl", type="filepath")
        val_file = gr.File(label="valset.jsonl", type="filepath")
        opt_btn = gr.Button("ابدأ التحسين")
        result = gr.Textbox(label="نتيجة التحسين")
        opt_btn.click(optimize, inputs=[train_file, val_file], outputs=result)

    demo.launch()