ramysaidagieb committed on
Commit
bafba0e
·
verified ·
1 Parent(s): 426d264

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -16
app.py CHANGED
@@ -8,26 +8,25 @@ from sentence_transformers import SentenceTransformer
8
  from dspy import Example, MIPROv2, Evaluate, evaluate
9
  from dspy import LiteLLM
10
 
11
- # تحميل التوكن من secrets
12
- HF_TOKEN = os.environ.get("HF_TOKEN")
13
 
14
- # إعداد النموذج عبر LiteLLM من Hugging Face Inference Endpoints
15
  dspy.settings.configure(
16
  lm=LiteLLM(
17
- model="HuggingFaceH4/zephyr-7b-beta", # يمكنك تغييره لأي نموذج Instruct مفتوح
18
  api_base="https://api-inference.huggingface.co/v1",
19
  api_key=HF_TOKEN
20
  )
21
  )
22
 
23
- # إعداد ChromaDB
24
  client = chromadb.PersistentClient(path="./chroma_db")
25
  col = client.get_or_create_collection(name="arabic_docs")
26
 
27
- # نموذج Embedding يدعم العربية
28
  embedder = SentenceTransformer("sentence-transformers/LaBSE")
29
 
30
- # استخراج النصوص من PDF
31
  def process_pdf(pdf_bytes):
32
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
33
  texts = []
@@ -38,7 +37,6 @@ def process_pdf(pdf_bytes):
38
  texts.append(chunk.strip())
39
  return texts
40
 
41
- # إدخال النصوص إلى قاعدة Chroma
42
  def ingest(pdf_file):
43
  pdf_bytes = pdf_file
44
  texts = process_pdf(pdf_bytes)
@@ -47,20 +45,17 @@ def ingest(pdf_file):
47
  col.add(ids=[f"chunk_{i}"], embeddings=[emb.tolist()], metadatas=[{"text": chunk}])
48
  return f"✅ تمت إضافة {len(texts)} مقطعاً."
49
 
50
- # استرجاع السياق الأقرب للسؤال
51
  def retrieve_context(question):
52
  embedding = embedder.encode([question])[0]
53
  results = col.query(query_embeddings=[embedding.tolist()], n_results=3)
54
  context_list = [m["text"] for m in results["metadatas"][0]]
55
  return "\n\n".join(context_list)
56
 
57
- # تعريف توقيع وحدة RAG
58
  class RagSig(dspy.Signature):
59
  question: str = dspy.InputField()
60
  context: str = dspy.InputField()
61
  answer: str = dspy.OutputField()
62
 
63
- # وحدة RAG
64
  class RagMod(dspy.Module):
65
  def __init__(self):
66
  super().__init__()
@@ -72,17 +67,14 @@ class RagMod(dspy.Module):
72
 
73
  model = RagMod()
74
 
75
- # توليد الإجابة
76
  def answer(question):
77
  out = model(question)
78
  return out.answer
79
 
80
- # تحميل مجموعة بيانات التدريب
81
  def load_dataset(path):
82
  with open(path, "r", encoding="utf-8") as f:
83
  return [Example(**json.loads(l)).with_inputs("question") for l in f]
84
 
85
- # تحسين النموذج
86
  def optimize(train_file, val_file):
87
  global model
88
  trainset = load_dataset(train_file.name)
@@ -92,9 +84,8 @@ def optimize(train_file, val_file):
92
  model = optimized
93
  return "✅ تم تحسين النموذج!"
94
 
95
- # واجهة Gradio
96
  with gr.Blocks() as demo:
97
- gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + ChromaDB + HF Inference")
98
 
99
  with gr.Tab("📥 تحميل وتخزين"):
100
  pdf_input = gr.File(label="ارفع ملف PDF", type="binary")
 
8
  from dspy import Example, MIPROv2, Evaluate, evaluate
9
  from dspy import LiteLLM
10
 
11
+ # تحميل التوكن من Secrets
12
+ HF_TOKEN = os.environ["HF_TOKEN"]
13
 
14
+ # تهيئة النموذج عبر LiteLLM من Hugging Face API
15
  dspy.settings.configure(
16
  lm=LiteLLM(
17
+ model="HuggingFaceH4/zephyr-7b-beta", # اختر نموذج Instruct مدعوم
18
  api_base="https://api-inference.huggingface.co/v1",
19
  api_key=HF_TOKEN
20
  )
21
  )
22
 
23
+ # إعداد قاعدة بيانات Chroma
24
  client = chromadb.PersistentClient(path="./chroma_db")
25
  col = client.get_or_create_collection(name="arabic_docs")
26
 
27
+ # إعداد نموذج LaBSE للتضمين العربي
28
  embedder = SentenceTransformer("sentence-transformers/LaBSE")
29
 
 
30
  def process_pdf(pdf_bytes):
31
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
32
  texts = []
 
37
  texts.append(chunk.strip())
38
  return texts
39
 
 
40
  def ingest(pdf_file):
41
  pdf_bytes = pdf_file
42
  texts = process_pdf(pdf_bytes)
 
45
  col.add(ids=[f"chunk_{i}"], embeddings=[emb.tolist()], metadatas=[{"text": chunk}])
46
  return f"✅ تمت إضافة {len(texts)} مقطعاً."
47
 
 
48
  def retrieve_context(question):
49
  embedding = embedder.encode([question])[0]
50
  results = col.query(query_embeddings=[embedding.tolist()], n_results=3)
51
  context_list = [m["text"] for m in results["metadatas"][0]]
52
  return "\n\n".join(context_list)
53
 
 
54
  class RagSig(dspy.Signature):
55
  question: str = dspy.InputField()
56
  context: str = dspy.InputField()
57
  answer: str = dspy.OutputField()
58
 
 
59
  class RagMod(dspy.Module):
60
  def __init__(self):
61
  super().__init__()
 
67
 
68
  model = RagMod()
69
 
 
70
  def answer(question):
71
  out = model(question)
72
  return out.answer
73
 
 
74
  def load_dataset(path):
75
  with open(path, "r", encoding="utf-8") as f:
76
  return [Example(**json.loads(l)).with_inputs("question") for l in f]
77
 
 
78
  def optimize(train_file, val_file):
79
  global model
80
  trainset = load_dataset(train_file.name)
 
84
  model = optimized
85
  return "✅ تم تحسين النموذج!"
86
 
 
87
  with gr.Blocks() as demo:
88
+ gr.Markdown("## 🧠 نظام RAG عربي باستخدام DSPy + ChromaDB + Hugging Face Inference")
89
 
90
  with gr.Tab("📥 تحميل وتخزين"):
91
  pdf_input = gr.File(label="ارفع ملف PDF", type="binary")