Spaces:
Build error
Build error
import dspy, gradio as gr | |
import chromadb | |
from chromadb.config import Settings | |
import fitz # PyMuPDF | |
from sentence_transformers import SentenceTransformer | |
import json | |
from dspy import Example, MIPROv2, Evaluate, evaluate | |
# LLM setup.
# dspy.LM is the unified client in current DSPy releases; the provider is
# encoded as a prefix of the model string.
dspy.configure(lm=dspy.LM("openai/gpt-4"))

# Vector database setup.
# PersistentClient replaces the legacy
# Settings(chroma_db_impl="duckdb+parquet", persist_directory=...) form,
# which current Chroma versions reject as a deprecated configuration.
client = chromadb.PersistentClient(path="./chroma_db")
col = client.get_or_create_collection(
    name="arabic_docs",
    metadata={"hnsw:space": "cosine"},
)

# Multilingual sentence encoder used for both stored chunks and queries.
embedder = SentenceTransformer("sentence-transformers/LaBSE")
# Import and chunk a PDF.
def process_pdf(pdf_bytes):
    """Extract paragraph-sized text chunks from an in-memory PDF.

    Each page's text is split on blank lines; a chunk is kept only when its
    whitespace-stripped length exceeds 50 characters.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list of stripped text chunks in page order.
    """
    chunks = []
    # Context manager closes the document even if text extraction raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            for raw in page.get_text().split("\n\n"):
                # Strip before measuring so padding whitespace does not let
                # a near-empty fragment through the length filter.
                chunk = raw.strip()
                if len(chunk) > 50:
                    chunks.append(chunk)
    return chunks
def ingest(pdf_bytes):
    """Chunk a PDF, embed the chunks and store them in the collection.

    Args:
        pdf_bytes: The uploaded PDF. Accepts raw bytes, a filesystem path,
            an object exposing the path as ``.name`` (temp-file wrapper),
            or ``None`` when nothing was uploaded.

    Returns:
        The status message reporting how many chunks were added.
    """
    import uuid  # local import: only needed here for ID generation

    if pdf_bytes is None:
        texts = []
    else:
        if isinstance(pdf_bytes, (bytes, bytearray)):
            data = pdf_bytes
        else:
            # Path string, or a wrapper object carrying the path in `.name`.
            path = getattr(pdf_bytes, "name", pdf_bytes)
            with open(path, "rb") as fh:
                data = fh.read()
        texts = process_pdf(data)

    if texts:
        embeddings = embedder.encode(texts, show_progress_bar=True)
        # A per-upload prefix keeps IDs unique across documents; plain
        # "chunk_{i}" restarts at 0 for every PDF and collides with chunks
        # stored by earlier uploads.
        prefix = uuid.uuid4().hex
        batch_size = 500  # bounded batches instead of one add() per chunk
        total = len(texts)
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            col.add(
                ids=[f"{prefix}_chunk_{i}" for i in range(start, stop)],
                embeddings=[emb.tolist() for emb in embeddings[start:stop]],
                metadatas=[{"text": chunk} for chunk in texts[start:stop]],
            )
    return f"تمت إضافة {len(texts)} مقطعاً"
def retriever(question, k=3):
    """Return the stored text chunks most similar to ``question``.

    The question is embedded with the same encoder used at ingestion time
    and the collection is searched by embedding; the chunk text is read back
    from each hit's ``"text"`` metadata entry.

    Args:
        question: Query string.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings, best match first; empty when nothing matches.
    """
    query_embedding = embedder.encode([question])[0].tolist()
    result = col.query(query_embeddings=[query_embedding], n_results=k)
    # Chroma returns one result list per query; exactly one query was sent.
    per_query = result.get("metadatas") or [[]]
    return [meta["text"] for meta in per_query[0] if meta and "text" in meta]
class RagSig(dspy.Signature):
    """Answer the question using the provided context."""

    # NOTE: DSPy uses the class docstring above as the task instructions
    # sent to the language model, so it is phrased as an instruction.
    # Every field must be declared with InputField/OutputField so DSPy knows
    # which side of the prompt it belongs to.
    question: str = dspy.InputField()
    context: str = dspy.InputField()
    answer: str = dspy.OutputField()
class RagMod(dspy.Module):
    """Retrieval-augmented QA module: fetch a context passage, then predict."""

    def __init__(self):
        super().__init__()
        self.predictor = dspy.Predict(RagSig)

    def forward(self, question):
        """Answer ``question`` using the top retrieved passage as context.

        Args:
            question: The user's question.

        Returns:
            The predictor's output for the ``RagSig`` signature.
        """
        retrieved = retriever(question)
        # Accept either a plain list of passages or a result object that
        # exposes them as `.passages`.
        passages = getattr(retrieved, "passages", retrieved)
        # Nothing retrieved (e.g. empty collection): fall back to an empty
        # context instead of raising IndexError.
        context = passages[0] if passages else ""
        return self.predictor(question=question, context=context)
# Active RAG program; the optimization step rebinds this name.
model = RagMod()


def answer(question):
    """Run the active RAG program on ``question`` and return its answer text."""
    prediction = model(question)
    return prediction.answer
def load_dataset(path):
    """Load a JSONL file into a list of DSPy examples.

    Each non-blank line is parsed as one JSON object whose keys become the
    example's fields; ``question`` is marked as the input field.

    Args:
        path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A list of ``Example`` objects, one per non-blank line.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [
            Example(**json.loads(line)).with_inputs("question")
            for line in f
            # Blank lines would otherwise raise JSONDecodeError.
            if line.strip()
        ]
def optimize(train_file, val_file):
    """Optimize the active RAG program with MIPROv2 and install the result.

    Args:
        train_file: Uploaded training JSONL; either a path string or an
            object exposing the path as ``.name``.
        val_file: Uploaded validation JSONL, same accepted forms.

    Returns:
        The success message shown in the UI.
    """
    # Must precede any use of `model` in this function; declaring it after
    # a use is a SyntaxError that prevents the whole file from compiling.
    global model

    # Accept both a temp-file wrapper (`.name`) and a plain path string.
    train_path = getattr(train_file, "name", train_file)
    val_path = getattr(val_file, "name", val_file)
    trainset = load_dataset(train_path)
    valset = load_dataset(val_path)

    tp = MIPROv2(metric=evaluate.answer_exact_match, auto="light", num_threads=4)
    model = tp.compile(model, trainset=trainset, valset=valset)
    return "✅ تم تحسين النموذج!"
# Gradio UI: three tabs for ingestion, question answering and optimization.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy")

    with gr.Tab("📥 تحميل وتخزين"):
        # type="binary" makes Gradio hand the handler the file's raw bytes,
        # which is what the PDF ingestion handler is written to receive.
        pdf_input = gr.File(label="ارفع ملف PDF", type="binary")
        ingest_btn = gr.Button("إضافة إلى قاعدة البيانات")
        ingest_status = gr.Textbox()
        ingest_btn.click(ingest, inputs=pdf_input, outputs=ingest_status)

    with gr.Tab("❓ سؤال"):
        q = gr.Textbox(label="اكتب سؤالك")
        answer_btn = gr.Button("احصل على الإجابة")
        out = gr.Textbox(label="الإجابة")
        answer_btn.click(answer, inputs=q, outputs=out)

    with gr.Tab("⚙️ تحسين النموذج"):
        train_file = gr.File(label="trainset.jsonl")
        val_file = gr.File(label="valset.jsonl")
        opt_btn = gr.Button("ابدأ التحسين")
        result = gr.Textbox(label="نتيجة التحسين")
        opt_btn.click(optimize, inputs=[train_file, val_file], outputs=result)

# Start the server only when run as a script, so importing this module
# (tests, tooling) does not launch it.
if __name__ == "__main__":
    demo.launch()