Spaces:
Build error
Build error
File size: 4,090 Bytes
ebae726 c0b2a4d d3d92c6 ebae726 c0b2a4d d3d92c6 da06f67 c0b2a4d d3d92c6 e3b4042 c0b2a4d d3d92c6 c0b2a4d d3d92c6 ebae726 c0b2a4d 183e91e c0b2a4d da06f67 c0b2a4d 6ec351d 6a78ac0 6ec351d d3d92c6 c0b2a4d d3d92c6 da06f67 c0b2a4d ebae726 d3d92c6 ebae726 d3d92c6 c0b2a4d 1c9be4e c0b2a4d d3d92c6 c0b2a4d ebae726 c0b2a4d d3d92c6 c0b2a4d d3d92c6 c0b2a4d ebae726 c0b2a4d 309ee8b c0b2a4d f30a153 c0b2a4d e3b4042 c0b2a4d d3d92c6 c0b2a4d d3d92c6 e3b4042 c0b2a4d da06f67 c0b2a4d d3d92c6 e3b4042 c0b2a4d d3d92c6 c0b2a4d da06f67 c0b2a4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import dspy
import gradio as gr
import chromadb
import fitz # PyMuPDF
import json
from sentence_transformers import SentenceTransformer
from dspy import Example, MIPROv2, evaluate
# Set up the open-source language model that DSPy will call.
# NOTE(review): dspy.LM is given a bare Hugging Face repo id here; confirm
# that this id (without a provider prefix) resolves in the deployed setup.
dspy.settings.configure(lm=dspy.LM("mistralai/Mistral-7B-Instruct-v0.2"))
# Set up Chroma: a persistent client rooted at ./chroma_db and the collection
# that ingest() writes to and retrieve_context() queries.
client = chromadb.PersistentClient(path="./chroma_db")
col = client.get_or_create_collection(name="arabic_docs")
# Set up the embedding model (LaBSE, loaded through sentence-transformers).
embedder = SentenceTransformer("sentence-transformers/LaBSE")
# Split PDF text into chunks
def process_pdf(pdf_bytes):
    """Extract paragraph-sized text chunks from an in-memory PDF.

    The text of every page is split on blank lines ("\\n\\n"). Chunks whose
    stripped length is 50 characters or fewer are discarded.

    Args:
        pdf_bytes: Raw bytes of a PDF document.

    Returns:
        A list of stripped text chunks, in page order.
    """
    texts = []
    # Open the document as a context manager so it is closed even when
    # text extraction raises part-way through.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            for chunk in page.get_text().split("\n\n"):
                stripped = chunk.strip()
                if len(stripped) > 50:
                    texts.append(stripped)
    return texts
# Ingest data into Chroma, with NamedString support
def ingest(pdf_file):
    """Chunk an uploaded PDF, embed the chunks and store them in Chroma.

    Args:
        pdf_file: The upload handed over by the Gradio File component. May be
            a file-like object (has ``read``), a ``str`` path (Gradio's
            NamedString is a ``str`` subclass), raw ``bytes``, or ``None``
            when nothing was uploaded.

    Returns:
        A status message (in Arabic) for display in the UI.
    """
    if pdf_file is None:
        # The button was clicked without an upload; report instead of crashing.
        return "⚠️ الرجاء رفع ملف PDF أولاً."

    if hasattr(pdf_file, "read"):
        pdf_bytes = pdf_file.read()
    elif isinstance(pdf_file, str):
        # A str (including NamedString) is a path to the uploaded file, not
        # the PDF content itself, so read the bytes from disk.
        with open(pdf_file, "rb") as fh:
            pdf_bytes = fh.read()
    else:
        pdf_bytes = pdf_file  # already raw bytes

    texts = process_pdf(pdf_bytes)
    embeddings = embedder.encode(texts, show_progress_bar=True)

    # Continue numbering after the chunks already stored so that ingesting a
    # second document does not reuse the ids of the first one.
    first_id = col.count()
    for idx, (chunk, emb) in enumerate(zip(texts, embeddings), start=first_id):
        col.add(
            ids=[f"chunk_{idx}"],
            embeddings=[emb.tolist()],
            metadatas=[{"text": chunk}],
        )
    return f"✅ تمت إضافة {len(texts)} مقطعاً."
# Retrieve context from Chroma
def retrieve_context(query):
    """Return the stored text of the top Chroma hit for *query*.

    The query is embedded with the module-level ``embedder`` and a single
    result is requested from the collection. An empty string is returned
    when the query yields no metadata entries.
    """
    query_vector = embedder.encode([query])[0].tolist()
    hits = col.query(query_embeddings=[query_vector], n_results=1)

    passages = []
    for group in hits["metadatas"]:
        for meta in group:
            passages.append(meta["text"])

    if not passages:
        return ""
    return passages[0]
# DSPy signature definition: takes a question and a context, yields an answer.
# NOTE(review): plain comments are used instead of a class docstring because
# DSPy appears to treat a Signature's docstring as prompt instructions —
# confirm before adding one.
class RagSig(dspy.Signature):
    question: str = dspy.InputField()  # input: the user's question
    context: str = dspy.InputField()  # input: passage supplied by RagMod.forward
    answer: str = dspy.OutputField()  # output: the generated answer
# DSPy module
class RagMod(dspy.Module):
    """Question-answering module that feeds retrieved context to a predictor."""

    def __init__(self):
        super().__init__()
        self.predictor = dspy.Predict(RagSig)

    def forward(self, question):
        """Fetch context for *question* and return the predictor's output."""
        passage = retrieve_context(question)
        prediction = self.predictor(question=question, context=passage)
        return prediction


# Module-level instance; optimize() rebinds this name to the compiled module.
model = RagMod()
# Generate an answer
def answer(question):
    """Run the current global ``model`` on *question* and return its ``answer`` field."""
    return model(question).answer
# Load training/evaluation data
def load_dataset(path):
    """Load a JSONL file into a list of DSPy examples.

    Every non-blank line must be a JSON object; its keys become the fields
    of an ``Example`` whose input field is ``question``.

    Args:
        path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A list of ``Example`` objects, one per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue
            record = json.loads(line)
            examples.append(Example(**record).with_inputs("question"))
    return examples
# Optimize the model
def optimize(train_file, val_file):
    """Compile the global ``model`` with MIPROv2 and replace it with the result.

    Args:
        train_file: Uploaded training JSONL; either an object exposing the
            file path as ``.name`` or a plain path string. ``None`` when
            nothing was uploaded.
        val_file: Uploaded validation JSONL, same conventions as
            ``train_file``.

    Returns:
        A status message (in Arabic) for display in the UI.
    """
    global model

    if train_file is None or val_file is None:
        # Nothing to optimize against; report instead of failing on None.name.
        return "⚠️ الرجاء رفع ملفي التدريب والتقييم أولاً."

    # Accept both upload objects carrying ``.name`` and bare path strings.
    trainset = load_dataset(getattr(train_file, "name", train_file))
    valset = load_dataset(getattr(val_file, "name", val_file))

    tp = MIPROv2(metric=evaluate.answer_exact_match, auto="light", num_threads=4)
    # This runs inside a web request handler where nobody can answer a
    # console prompt, so disable the interactive confirmation step.
    optimized = tp.compile(
        model,
        trainset=trainset,
        valset=valset,
        requires_permission_to_run=False,
    )
    model = optimized
    return "✅ تم تحسين النموذج!"
# Gradio interface: three tabs wired to ingest(), answer() and optimize().
# Components are created in display order, so statement order matters here.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + نموذج مفتوح المصدر")
    # Tab 1: upload a PDF and store its chunks.
    with gr.Tab("📥 تحميل وتخزين"):
        pdf_input = gr.File(label="ارفع ملف PDF", file_types=[".pdf"])
        ingest_btn = gr.Button("إضافة إلى قاعدة البيانات")
        ingest_output = gr.Textbox()
        ingest_btn.click(fn=ingest, inputs=pdf_input, outputs=ingest_output)
    # Tab 2: ask a question and show the model's answer.
    with gr.Tab("❓ سؤال"):
        q = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer_btn = gr.Button("احصل على الإجابة")
        out = gr.Textbox(label="الإجابة")
        answer_btn.click(fn=answer, inputs=q, outputs=out)
    # Tab 3: upload train/validation JSONL files and run the optimizer.
    with gr.Tab("⚙️ تحسين النموذج"):
        train_file = gr.File(label="trainset.jsonl", file_types=[".jsonl"])
        val_file = gr.File(label="valset.jsonl", file_types=[".jsonl"])
        opt_btn = gr.Button("ابدأ التحسين")
        result = gr.Textbox(label="نتيجة التحسين")
        opt_btn.click(optimize, inputs=[train_file, val_file], outputs=result)
# Launch the app; this call is unguarded, so it runs whenever the module is executed.
demo.launch()
|