Spaces:
Build error
Build error
File size: 4,170 Bytes
f826667 26c4c4f c0b2a4d 053a53d f826667 053a53d f826667 c0b2a4d f826667 66cae2e f826667 66cae2e c0b2a4d f826667 e3b4042 c0b2a4d 26c4c4f c0b2a4d 053a53d ebae726 c0b2a4d 26c4c4f c0b2a4d da06f67 c0b2a4d 26c4c4f 6a78ac0 f826667 d3d92c6 c0b2a4d d3d92c6 da06f67 c0b2a4d 26c4c4f 053a53d 26c4c4f 053a53d ebae726 26c4c4f c0b2a4d 1c9be4e c0b2a4d 26c4c4f c0b2a4d ebae726 c0b2a4d 26c4c4f c0b2a4d f826667 c0b2a4d 26c4c4f c0b2a4d 309ee8b c0b2a4d f30a153 c0b2a4d f826667 e3b4042 c0b2a4d 66cae2e c0b2a4d 053a53d e3b4042 c0b2a4d da06f67 c0b2a4d 053a53d e3b4042 c0b2a4d 053a53d c0b2a4d da06f67 c0b2a4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import os
import dspy
import gradio as gr
import chromadb
import fitz # PyMuPDF
import json
from sentence_transformers import SentenceTransformer
from dspy import Example, MIPROv2, Evaluate, evaluate
from litellm import completion # Ensure LiteLLM is installed
from dspy.lm import LiteLLM
# Configure the language model via LiteLLM + Hugging Face.
# NOTE(review): `LiteLLM` comes from the `dspy.lm` import at the top of the
# file; confirm that the installed DSPy release actually exposes that class.
HF_TOKEN = os.environ.get("HF_TOKEN")  # None when the variable is not set

# Build the LM client first, then register it as DSPy's default LM.
_lm = LiteLLM(
    model="HuggingFaceH4/zephyr-7b-beta",
    api_base="https://api-inference.huggingface.co/v1",
    api_key=HF_TOKEN,
)
dspy.settings.configure(lm=_lm)
# Chroma database setup: an on-disk client plus the collection that holds
# the ingested chunks.
_CHROMA_DIR = "./chroma_db"
_COLLECTION_NAME = "arabic_docs"
client = chromadb.PersistentClient(path=_CHROMA_DIR)
col = client.get_or_create_collection(name=_COLLECTION_NAME)

# Embedding model with Arabic support.
_EMBEDDING_MODEL = "sentence-transformers/LaBSE"
embedder = SentenceTransformer(_EMBEDDING_MODEL)
# Split the text of a PDF into chunks.
def process_pdf(pdf_bytes, min_chars=50):
    """Extract paragraph-sized text chunks from an in-memory PDF.

    Each page's text is split on blank lines ("\\n\\n"); pieces are stripped
    and kept only when they are longer than ``min_chars`` characters.

    Args:
        pdf_bytes: Raw PDF content, passed to PyMuPDF as a stream.
        min_chars: Stripped chunks must be strictly longer than this to be
            kept. Defaults to 50.

    Returns:
        A list of stripped text chunks in page order.
    """
    chunks = []
    # Open the document as a context manager so it is closed even when text
    # extraction raises part-way through.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            for piece in page.get_text().split("\n\n"):
                piece = piece.strip()
                if len(piece) > min_chars:
                    chunks.append(piece)
    return chunks
# Insert the PDF's chunks into Chroma.
def ingest(pdf_file):
    """Chunk an uploaded PDF, embed the chunks and store them in the collection.

    Chunk ids are derived from a hash of the PDF content plus the chunk
    index. Ids of the form ``chunk_<i>`` restart at zero for every upload, so
    a second document would collide with ids already in the collection;
    content-derived ids keep different PDFs apart, and ``upsert`` makes
    re-uploading the same PDF idempotent.

    Args:
        pdf_file: Raw PDF bytes as delivered by the upload component.

    Returns:
        A status message stating how many chunks were stored.
    """
    import hashlib  # local import: only needed to derive stable chunk ids

    pdf_bytes = pdf_file
    texts = process_pdf(pdf_bytes)
    summary = f"✅ تمت إضافة {len(texts)} مقطعاً."
    if not texts:
        # Nothing to embed or store; skip the model and database calls.
        return summary

    doc_key = hashlib.sha256(pdf_bytes).hexdigest()[:16]
    embeddings = embedder.encode(texts, show_progress_bar=True)

    ids = [f"{doc_key}_{i}" for i in range(len(texts))]
    vectors = [emb.tolist() for emb in embeddings]
    metadatas = [{"text": chunk} for chunk in texts]

    # Write in moderately sized batches instead of one call per chunk; the
    # slice size keeps each request small for very large documents.
    batch_size = 1000
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        col.upsert(
            ids=ids[start:stop],
            embeddings=vectors[start:stop],
            metadatas=metadatas[start:stop],
        )
    return summary
# Retrieve context for a question.
def retrieve_context(question, n_results=3):
    """Return the stored chunks most similar to ``question`` as one string.

    Args:
        question: The user question to embed and search with.
        n_results: Maximum number of chunks to retrieve. Defaults to 3.

    Returns:
        The "text" metadata of the retrieved chunks joined by blank lines;
        an empty string when nothing is retrieved.
    """
    query_vector = embedder.encode([question])[0]
    results = col.query(
        query_embeddings=[query_vector.tolist()],
        n_results=n_results,
        # Only the metadata is read below, so do not fetch anything else.
        include=["metadatas"],
    )
    # One inner list per query embedding; a single query was sent.
    hits = (results.get("metadatas") or [[]])[0]
    # Skip entries without a "text" field instead of raising on them.
    return "\n\n".join(meta["text"] for meta in hits if meta and "text" in meta)
# Signature of the RAG unit: a question and its context go in, an answer
# comes out.
# NOTE(review): documentation is kept in comments rather than a class
# docstring because DSPy is understood to use a Signature's docstring as
# prompt instructions; confirm before adding one.
class RagSig(dspy.Signature):
    # Inputs
    question: str = dspy.InputField()
    context: str = dspy.InputField()

    # Output
    answer: str = dspy.OutputField()
# RAG unit.
class RagMod(dspy.Module):
    """DSPy module that looks up context for a question and predicts an answer."""

    def __init__(self):
        super().__init__()
        # Single predictor built from the RagSig signature.
        self.predictor = dspy.Predict(RagSig)

    def forward(self, question):
        """Return the predictor's output for ``question`` and its retrieved context."""
        return self.predictor(
            question=question,
            context=retrieve_context(question),
        )
model = RagMod()


# Generate the answer.
def answer(question):
    """Run the module-level ``model`` on ``question`` and return its answer field."""
    return model(question).answer
# Load training and validation data.
def load_dataset(path):
    """Read a JSON Lines file into a list of DSPy examples.

    Every non-blank line must hold one JSON object; its keys become the
    fields of an ``Example`` whose input field is ``question``. Blank or
    whitespace-only lines (for instance a trailing empty line) are skipped
    rather than being handed to the JSON parser.

    Args:
        path: Filesystem path of the ``.jsonl`` file.

    Returns:
        A list of examples in file order; empty when the file has no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # "utf-8-sig" reads plain UTF-8 as well but drops a leading BOM, which
    # would otherwise make the first record unparsable.
    with open(path, "r", encoding="utf-8-sig") as f:
        return [
            Example(**json.loads(line)).with_inputs("question")
            for line in f
            if line.strip()
        ]
def _upload_path(upload):
    """Return the path of an upload given as a path string or an object with ``.name``."""
    return getattr(upload, "name", upload)


# Optimize the model.
def optimize(train_file, val_file):
    """Optimize the module-level ``model`` with MIPROv2 and replace it.

    Args:
        train_file: Uploaded training set (JSON Lines), either a path string
            or a file-like object exposing ``.name``.
        val_file: Uploaded validation set, in the same form as ``train_file``.

    Returns:
        A status message: a warning when either upload is missing, otherwise
        a confirmation that the model was optimized.
    """
    global model
    # The button can be clicked before both files are uploaded; report that
    # instead of failing on a None value.
    if train_file is None or val_file is None:
        return "⚠️ يرجى رفع ملفي trainset.jsonl و valset.jsonl أولاً."

    trainset = load_dataset(_upload_path(train_file))
    valset = load_dataset(_upload_path(val_file))
    tp = MIPROv2(metric=evaluate.answer_exact_match, auto="light", num_threads=4)
    # Rebind the global so later questions are answered by the optimized program.
    model = tp.compile(model, trainset=trainset, valset=valset)
    return "✅ تم تحسين النموذج!"
# Gradio interface: three tabs for ingestion, question answering and
# model optimization.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + ChromaDB + Hugging Face")

    # Tab 1: upload a PDF and store its chunks.
    with gr.Tab("📥 تحميل وتخزين"):
        pdf_input = gr.File(label="ارفع ملف PDF", type="binary")
        ingest_btn = gr.Button("إضافة إلى قاعدة البيانات")
        ingest_out = gr.Textbox(label="نتيجة الإضافة")
        ingest_btn.click(fn=ingest, inputs=pdf_input, outputs=ingest_out)

    # Tab 2: ask a question and show the answer.
    with gr.Tab("❓ سؤال"):
        q = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer_btn = gr.Button("احصل على الإجابة")
        out = gr.Textbox(label="الإجابة")
        answer_btn.click(fn=answer, inputs=q, outputs=out)

    # Tab 3: upload train/validation sets and run the optimizer.
    with gr.Tab("⚙️ تحسين النموذج"):
        train_file = gr.File(label="trainset.jsonl")
        val_file = gr.File(label="valset.jsonl")
        opt_btn = gr.Button("ابدأ التحسين")
        result = gr.Textbox(label="نتيجة التحسين")
        opt_btn.click(
            fn=optimize,
            inputs=[train_file, val_file],
            outputs=result,
        )

demo.launch()
|