ramysaidagieb committed on
Commit
8da3287
·
verified ·
1 Parent(s): c46782a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -117
app.py CHANGED
@@ -1,117 +0,0 @@
1
- import os
2
- import dspy
3
- import gradio as gr
4
- import chromadb
5
- import fitz # PyMuPDF
6
- import json
7
- from sentence_transformers import SentenceTransformer
8
- from dspy import Example, MIPROv2, Evaluate, evaluate
9
- from litellm import completion # Ensure LiteLLM is installed
10
- from dspy.lm import LiteLLM
11
-
12
- # إعداد نموذج اللغة باستخدام LiteLLM + Hugging Face
13
# Hugging Face access token, read from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# NOTE(review): confirm that the installed dspy release really exposes
# `dspy.lm.LiteLLM`; newer releases configure models through `dspy.LM(...)`.
_hf_lm = LiteLLM(
    model="HuggingFaceH4/zephyr-7b-beta",
    api_base="https://api-inference.huggingface.co/v1",
    api_key=HF_TOKEN,
)
dspy.settings.configure(lm=_hf_lm)

# Chroma database setup: on-disk client plus the collection holding the chunks.
client = chromadb.PersistentClient(path="./chroma_db")
col = client.get_or_create_collection(name="arabic_docs")

# Embedding model that supports Arabic.
embedder = SentenceTransformer("sentence-transformers/LaBSE")
28
-
29
- # تقطيع النصوص من PDF
30
def process_pdf(pdf_bytes, min_chars=50):
    """Extract paragraph-sized text chunks from an in-memory PDF.

    The text of every page is split on blank lines; a piece is kept only
    when its stripped length is greater than ``min_chars``.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        min_chars: Pieces whose stripped length does not exceed this value
            are dropped. Defaults to 50, the previously hard-coded threshold.

    Returns:
        A list of stripped text chunks, in page order.
    """
    chunks = []
    # Open the document as a context manager so it is closed even when
    # text extraction raises; the original never closed it.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            for piece in page.get_text().split("\n\n"):
                stripped = piece.strip()
                if len(stripped) > min_chars:
                    chunks.append(stripped)
    return chunks
39
-
40
- # إدخال البيانات في Chroma
41
def ingest(pdf_file):
    """Chunk an uploaded PDF, embed the chunks and store them in Chroma.

    Args:
        pdf_file: Raw bytes of the uploaded PDF (the upload widget is
            configured with ``type="binary"``), or None when nothing was
            uploaded.

    Returns:
        A user-facing status message (Arabic) with the number of chunks
        stored, or a warning when no file was provided.
    """
    import hashlib

    # Clicking the button without a file passes None; report it instead of
    # crashing inside the PDF parser.
    if not pdf_file:
        return "⚠️ الرجاء رفع ملف PDF أولاً."

    texts = process_pdf(pdf_file)
    if not texts:
        # Nothing to embed or store; avoid sending empty batches to Chroma.
        return f"✅ تمت إضافة {len(texts)} مقطعاً."

    embeddings = embedder.encode(texts, show_progress_bar=True)

    # Prefix ids with a digest of the file content. The original ids
    # ("chunk_0", "chunk_1", ...) restarted at zero for every upload, so a
    # second PDF collided with the chunks of the first one.
    doc_key = hashlib.sha1(pdf_file).hexdigest()[:12]

    # One batched upsert instead of one call per chunk; upsert also makes
    # re-uploading the same PDF idempotent.
    col.upsert(
        ids=[f"{doc_key}_chunk_{i}" for i in range(len(texts))],
        embeddings=[emb.tolist() for emb in embeddings],
        metadatas=[{"text": chunk} for chunk in texts],
    )
    return f"✅ تمت إضافة {len(texts)} مقطعاً."
48
-
49
- # استرجاع السياق
50
def retrieve_context(question, n_results=3):
    """Return the stored chunks most similar to ``question`` as one string.

    Args:
        question: The user's question text.
        n_results: Maximum number of chunks to retrieve. Defaults to 3,
            the previously hard-coded value.

    Returns:
        The "text" metadata of the retrieved chunks joined by blank lines,
        or an empty string when the collection holds no chunks.
    """
    available = col.count()
    if available == 0:
        # Nothing has been ingested yet, so there is no context to offer.
        return ""

    query_vec = embedder.encode([question])[0]
    results = col.query(
        query_embeddings=[query_vec.tolist()],
        # Never ask for more neighbours than the collection contains.
        n_results=min(n_results, available),
    )
    # A single query was sent, so only the first result row is relevant.
    return "\n\n".join(meta["text"] for meta in results["metadatas"][0])
55
-
56
- # توقيع وحدة RAG
57
# NOTE: DSPy uses a Signature's class docstring as the task instructions sent
# to the language model, so the docstring below is part of the prompt. The
# original signature had none, leaving the model without any instruction to
# ground its answer in the retrieved context.
class RagSig(dspy.Signature):
    """Answer the question in Arabic, using only the information in the context."""

    # The user's question.
    question: str = dspy.InputField()
    # Retrieved passages the answer should be based on.
    context: str = dspy.InputField()
    # The generated answer.
    answer: str = dspy.OutputField()
61
-
62
- # وحدة RAG
63
class RagMod(dspy.Module):
    """Retrieve-then-answer module built on a single ``RagSig`` predictor."""

    def __init__(self):
        super().__init__()
        self.predictor = dspy.Predict(RagSig)

    def forward(self, question):
        """Fetch context for ``question`` and return the predictor's output."""
        retrieved = retrieve_context(question)
        prediction = self.predictor(question=question, context=retrieved)
        return prediction
71
-
72
model = RagMod()


# Answer generation
def answer(question):
    """Run the current module-level ``model`` on ``question`` and return its answer text."""
    return model(question).answer
78
-
79
- # تحميل بيانات التدريب والتحقق
80
def load_dataset(path):
    """Load a JSON Lines file into a list of DSPy examples.

    Every non-blank line is parsed as a JSON object whose keys become the
    example's fields; "question" is marked as the input field.

    Args:
        path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of ``Example`` objects, one per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records;
            # parsing them would raise JSONDecodeError.
            if not line.strip():
                continue
            record = json.loads(line)
            examples.append(Example(**record).with_inputs("question"))
    return examples
83
-
84
- # تحسين النموذج
85
def optimize(train_file, val_file):
    """Optimize the module-level ``model`` with MIPROv2 and replace it.

    Args:
        train_file: Uploaded training set (JSON Lines). Either a file path
            string or an object exposing the path as ``.name``.
        val_file: Uploaded validation set, in the same form.

    Returns:
        A user-facing status message (Arabic).
    """
    global model

    # The button can be clicked before both files are uploaded.
    if train_file is None or val_file is None:
        return "⚠️ الرجاء رفع ملفي التدريب والتحقق أولاً."

    def _path_of(upload):
        # Uploads may arrive as plain path strings or as file wrappers with
        # a ``.name`` attribute; accept both.
        return getattr(upload, "name", upload)

    trainset = load_dataset(_path_of(train_file))
    valset = load_dataset(_path_of(val_file))

    tp = MIPROv2(metric=evaluate.answer_exact_match, auto="light", num_threads=4)
    # Rebind the global so later calls to answer() use the optimized program.
    model = tp.compile(model, trainset=trainset, valset=valset)
    return "✅ تم تحسين النموذج!"
93
-
94
- # واجهة Gradio
95
# Gradio interface: three tabs for ingestion, question answering and optimization.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + ChromaDB + Hugging Face")

    # Tab 1: upload a PDF and store its chunks.
    with gr.Tab("📥 تحميل وتخزين"):
        pdf_input = gr.File(label="ارفع ملف PDF", type="binary")
        ingest_btn = gr.Button("إضافة إلى قاعدة البيانات")
        ingest_out = gr.Textbox(label="نتيجة الإضافة")
        ingest_btn.click(fn=ingest, inputs=pdf_input, outputs=ingest_out)

    # Tab 2: ask a question against the stored chunks.
    with gr.Tab("❓ سؤال"):
        q = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer_btn = gr.Button("احصل على الإجابة")
        out = gr.Textbox(label="الإجابة")
        answer_btn.click(fn=answer, inputs=q, outputs=out)

    # Tab 3: optimize the RAG module from uploaded train/validation sets.
    with gr.Tab("⚙️ تحسين النموذج"):
        train_file = gr.File(label="trainset.jsonl")
        val_file = gr.File(label="valset.jsonl")
        opt_btn = gr.Button("ابدأ التحسين")
        result = gr.Textbox(label="نتيجة التحسين")
        opt_btn.click(fn=optimize, inputs=[train_file, val_file], outputs=result)

demo.launch()